From f28ea561584d0bc39b23ef68c83d83376ab3f11d Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Wed, 25 Sep 2024 12:27:25 +0200 Subject: [PATCH 1/4] New upstream version 2.34.0+dfsg --- .github/workflows/build-all.yml | 46 + .github/workflows/build-x86.yml | 37 + .github/workflows/ci.yml | 158 +- .github/workflows/linux-packages.yml | 15 - .github/workflows/update-manpage.yml | 33 + .gitignore | 1 + CMakeLists.txt | 246 +- README.md | 85 +- common/cmdline.h | 101 - common/integers.h | 221 -- common/main.cc | 169 - common/output-file-unix.h | 144 - common/output-file-win32.h | 24 - common/output-file.h | 5 - common/uuid.cc | 20 - dist.sh | 110 +- docs/chart.svg | 1 + docs/comparison.png | Bin 72057 -> 0 bytes docs/design.md | 5 + docs/mold.1 | 104 +- docs/mold.md | 201 +- elf/arch-alpha.cc | 330 -- elf/input-sections.cc | 555 --- elf/linker-script.cc | 428 --- elf/lto.cc | 5 - install-build-deps.sh | 46 +- install-cross-tools.sh | 2 +- {common => lib}/archive-file.h | 20 +- {common => lib}/common.h | 471 +-- {common => lib}/compress.cc | 8 +- {common => lib}/config.h.in | 3 +- lib/crc32.cc | 60 + {common => lib}/demangle.cc | 29 +- {common => lib}/filepath.cc | 34 + {test => lib}/gentoo-test.sh | 9 +- {common => lib}/glob.cc | 13 +- {common => lib}/hyperloglog.cc | 0 lib/integers.h | 144 + elf/jobs.cc => lib/jobs-unix.cc | 43 +- lib/jobs-win32.cc | 6 + lib/malloc.cc | 5 + lib/mapped-file-unix.cc | 51 + lib/mapped-file-win32.cc | 81 + {common => lib}/multi-glob.cc | 82 +- {common => lib}/perf.cc | 8 +- lib/random.cc | 20 + lib/signal-unix.cc | 88 + lib/signal-win32.cc | 55 + lib/siphash.h | 144 + {common => lib}/tar.cc | 65 +- {common => lib}/update-git-hash.cmake | 0 {elf => src}/arch-arm32.cc | 153 +- {elf => src}/arch-arm64.cc | 52 +- {elf => src}/arch-i386.cc | 90 +- {elf => src}/arch-loongarch.cc | 504 ++- {elf => src}/arch-m68k.cc | 16 +- {elf => src}/arch-ppc32.cc | 31 +- {elf => src}/arch-ppc64v1.cc | 59 +- {elf => src}/arch-ppc64v2.cc | 145 +- {elf => 
src}/arch-riscv.cc | 410 +- {elf => src}/arch-s390x.cc | 43 +- {elf => src}/arch-sh4.cc | 38 +- {elf => src}/arch-sparc64.cc | 137 +- {elf => src}/arch-x86-64.cc | 217 +- {elf => src}/cmdline.cc | 495 ++- src/config.cc | 13 + {elf => src}/elf.cc | 65 +- {elf => src}/elf.h | 186 +- {common => src}/filetype.h | 102 +- {elf => src}/gc-sections.cc | 106 +- {elf => src}/gdb-index.cc | 17 +- {elf => src}/icf.cc | 97 +- {elf => src}/input-files.cc | 678 ++-- src/input-sections.cc | 440 +++ src/linker-script.cc | 424 +++ {elf => src}/lto-unix.cc | 261 +- {elf => src}/lto-win32.cc | 12 +- {elf => src}/lto.h | 8 +- {elf => src}/main.cc | 377 +- {elf => src}/mapfile.cc | 9 +- {elf => src}/mold-wrapper.c | 18 +- {elf => src}/mold.h | 855 +++-- {elf => src}/output-chunks.cc | 1208 +++--- src/output-file-unix.cc | 200 + src/output-file-win32.cc | 118 + {elf => src}/passes.cc | 1509 +++++--- {elf => src}/relocatable.cc | 10 +- src/shrink-sections.cc | 151 + elf/subprocess.cc => src/subprocess-unix.cc | 31 +- src/subprocess-win32.cc | 20 + {elf => src}/thunks.cc | 77 +- {elf => src}/tls.cc | 55 +- test/{elf => }/CMakeLists.txt | 69 +- test/{elf => }/abs-error.sh | 1 - test/{elf => }/absolute-symbols.sh | 4 +- test/{elf => }/allow-multiple-definition.sh | 0 test/{elf => }/ar-alignment.sh | 0 ...ch64-range-extension-thunk-disassembly.sh} | 2 - ...ant-pcs.sh => arch-aarch64-variant-pcs.sh} | 2 - test/arch-arm-abs-error.sh | 18 + ...-arm-range-extension-thunk-disassembly.sh} | 2 - ...k.sh => arch-arm-range-extension-thunk.sh} | 2 - ...terwork.sh => arch-arm-thumb-interwork.sh} | 2 - .../arm_tlsdesc.sh => arch-arm-tlsdesc.sh} | 1 - ...e-base.sh => arch-i686-tls-module-base.sh} | 2 - test/arch-i686-tlsdesc.sh | 48 + ...sh => arch-loongarch64-mcmodel-extreme.sh} | 0 test/arch-loongarch64-relax-call36.sh | 52 + test/arch-loongarch64-relax-got-load.sh | 33 + test/arch-loongarch64-relax-pcala-addi.sh | 58 + test/arch-loongarch64-relax-tlsdesc.sh | 43 + ...a.sh => 
arch-ppc64le-save-restore-gprs.sh} | 4 +- ...tributes.sh => arch-riscv64-attributes.sh} | 0 ...ibutes2.sh => arch-riscv64-attributes2.sh} | 0 test/arch-riscv64-global-pointer-dso.sh | 27 + test/arch-riscv64-global-pointer.sh | 26 + ...riscv64_norvc.sh => arch-riscv64-norvc.sh} | 2 - ...ible.sh => arch-riscv64-obj-compatible.sh} | 0 test/arch-riscv64-relax-got.sh | 79 + ...lax-hi20.sh => arch-riscv64-relax-hi20.sh} | 4 +- ...ak-undef.sh => arch-riscv64-weak-undef.sh} | 0 test/{elf/s390x_got.sh => arch-s390x-got.sh} | 6 +- test/arch-x86_64-address-equality.sh | 28 + ...=> arch-x86_64-empty-mergeable-section.sh} | 0 ....sh => arch-x86_64-emulation-deduction.sh} | 3 - ...=> arch-x86_64-exception-mcmodel-large.sh} | 4 +- ....sh => arch-x86_64-execstack-if-needed.sh} | 2 - ...inkonce.sh => arch-x86_64-gnu-linkonce.sh} | 2 - ...nu-retain.sh => arch-x86_64-gnu-retain.sh} | 2 - ..._gotpcrelx.sh => arch-x86_64-gotpcrelx.sh} | 0 ...nc-alias.sh => arch-x86_64-ifunc-alias.sh} | 0 ...-x86_64-incompatible-libs-linker-script.sh | 29 + ...x86_64-incompatible-libs-linker-script2.sh | 32 + ...bs.sh => arch-x86_64-incompatible-libs.sh} | 2 - ...2.sh => arch-x86_64-incompatible-libs2.sh} | 2 - ...obj.sh => arch-x86_64-incompatible-obj.sh} | 2 - ....sh => arch-x86_64-init-array-readonly.sh} | 2 - ...nit-array.sh => arch-x86_64-init-array.sh} | 2 - test/arch-x86_64-isa-level.sh | 17 + ..._large-bss.sh => arch-x86_64-large-bss.sh} | 2 - ...ds.sh => arch-x86_64-mergeable-records.sh} | 3 - .../arch-x86_64-mergeable-strings-nonalloc.sh | 23 + ...gs.sh => arch-x86_64-mergeable-strings.sh} | 3 - ...operty.sh => arch-x86_64-note-property.sh} | 3 - ...erty2.sh => arch-x86_64-note-property2.sh} | 3 - .../x86_64_note.sh => arch-x86_64-note.sh} | 4 +- .../x86_64_note2.sh => arch-x86_64-note2.sh} | 4 +- .../{elf/x86_64_plt.sh => arch-x86_64-plt.sh} | 2 - ...-array.sh => arch-x86_64-preinit-array.sh} | 2 - .../x86_64_relax.sh => arch-x86_64-relax.sh} | 1 - ...rflow.sh => 
arch-x86_64-reloc-overflow.sh} | 2 - ...eloc-zero.sh => arch-x86_64-reloc-zero.sh} | 2 - .../x86_64_reloc.sh => arch-x86_64-reloc.sh} | 2 - ...nt.sh => arch-x86_64-section-alignment.sh} | 2 - ...on-name.sh => arch-x86_64-section-name.sh} | 2 - test/arch-x86_64-tbss-only.sh | 19 + ...sh => arch-x86_64-tls-gd-mcmodel-large.sh} | 3 - ...d-to-ie.sh => arch-x86_64-tls-gd-to-ie.sh} | 0 ...-tbss.sh => arch-x86_64-tls-large-tbss.sh} | 2 - ...sh => arch-x86_64-tls-ld-mcmodel-large.sh} | 2 - ...base.sh => arch-x86_64-tls-module-base.sh} | 2 +- test/arch-x86_64-tlsdesc.sh | 47 + ...x86_64_unique.sh => arch-x86_64-unique.sh} | 2 - ...stack.sh => arch-x86_64-warn-execstack.sh} | 4 +- ....sh => arch-x86_64-warn-shared-textrel.sh} | 3 - ...textrel.sh => arch-x86_64-warn-textrel.sh} | 3 - .../x86_64_z-ibt.sh => arch-x86_64-z-ibt.sh} | 1 - ...64_z-ibtplt.sh => arch-x86_64-z-ibtplt.sh} | 2 - ...ndbr.sh => arch-x86_64-z-rewrite-endbr.sh} | 1 - test/arch-x86_64-z-rewrite-endbr2.sh | 27 + test/arch-x86_64-z-rewrite-endbr3.sh | 19 + ...6_64_z-shstk.sh => arch-x86_64-z-shstk.sh} | 1 - ...x86_64_z-text.sh => arch-x86_64-z-text.sh} | 3 - test/{elf => }/as-needed-dso.sh | 2 +- test/{elf => }/as-needed-dso2.sh | 0 test/{elf => }/as-needed-weak.sh | 6 +- test/{elf => }/as-needed.sh | 12 +- test/{elf => }/auxiliary.sh | 0 test/{elf => }/bno-symbolic.sh | 2 +- test/{elf => }/bsymbolic-functions.sh | 0 test/bsymbolic-non-weak-functions.sh | 42 + test/bsymbolic-non-weak.sh | 42 + test/{elf => }/bsymbolic.sh | 0 test/{elf => }/build-id.sh | 3 + test/{elf => }/canonical-plt.sh | 2 +- test/{elf => }/cmdline.sh | 0 test/{elf => }/color-diagnostics.sh | 0 test/{elf => }/comment.sh | 2 +- test/{elf => }/common-archive.sh | 0 test/{elf => }/common-ref.sh | 0 test/{elf/common.sh => common-symbols.sh} | 0 test/{elf => }/common.inc | 52 +- .../{elf => }/compress-debug-sections-zstd.sh | 0 test/{elf => }/compress-debug-sections.sh | 0 test/{elf => }/compressed-debug-info.sh | 0 test/{elf => 
}/copyrel-alignment.sh | 1 - test/copyrel-norelro.sh | 27 + test/{elf => }/copyrel-protected.sh | 3 +- test/{elf => }/copyrel-relro.sh | 0 test/{elf => }/copyrel-relro2.sh | 0 test/{elf => }/copyrel.sh | 0 test/{elf => }/ctors-in-init-array.sh | 0 test/{elf => }/dead-debug-sections.sh | 0 test/{elf => }/debug-macro-section.sh | 0 test/{elf => }/default-symver.sh | 0 test/{elf => }/defsym-lto.sh | 3 +- test/{elf => }/defsym-missing-symbol.sh | 0 test/{elf => }/defsym.sh | 0 test/{elf => }/defsym2.sh | 0 test/demangle-cpp.sh | 19 + test/{elf => }/demangle-rust.sh | 0 test/{elf => }/demangle.sh | 0 test/dependency-file-response-file.sh | 17 + test/{elf => }/dependency-file.sh | 0 test/{elf => }/disable-new-dtags.sh | 0 test/{elf => }/discard.sh | 3 +- test/{elf => }/dso-undef.sh | 0 test/{elf => }/dt-init.sh | 0 test/{elf => }/dt-needed.sh | 0 test/{elf => }/duplicate-error-archive.sh | 0 test/{elf => }/duplicate-error.sh | 0 test/{elf => }/dynamic-dt-debug.sh | 0 test/{elf => }/dynamic-linker.sh | 0 test/dynamic-list-data.sh | 13 + test/{elf => }/dynamic-list.sh | 0 test/{elf => }/dynamic-list2.sh | 0 test/{elf => }/dynamic-list3.sh | 0 test/{elf => }/dynamic-list4.sh | 0 test/{elf => }/dynamic.sh | 2 +- test/elf/mold-wrapper2.sh | 13 - test/elf/now.sh | 17 - test/elf/pack-dyn-relocs-relr.sh | 27 - test/elf/relocatable-no-ehframe.sh | 18 - test/elf/run.sh | 52 - test/elf/shared-abs-sym.sh | 29 - test/elf/z-pack-relative-relocs.sh | 16 - test/elf/z-start-stop-visibility.sh | 6 - test/{elf => }/emit-relocs-cpp.sh | 0 test/{elf => }/emit-relocs-dead-sections.sh | 0 test/{elf => }/emit-relocs.sh | 0 test/empty-arg.sh | 5 + test/{elf => }/empty-file.sh | 0 test/{elf => }/empty-input.sh | 0 test/{elf => }/empty-version.sh | 0 test/{elf => }/entry.sh | 0 test/exception-multiple-ehframe.sh | 48 + test/{elf => }/exception.sh | 5 +- test/{elf => }/exclude-libs.sh | 6 + test/{elf => }/exclude-libs2.sh | 0 test/{elf => }/exclude-libs3.sh | 0 test/{elf => }/execstack.sh | 0 
test/{elf => }/execute-only.sh | 1 + test/{elf => }/export-dynamic.sh | 0 test/{elf => }/export-from-exe.sh | 0 test/{elf => }/fatal-warnings.sh | 0 test/{elf => }/filler.sh | 0 test/{elf => }/filter.sh | 0 test/{elf => }/func-addr.sh | 0 test/{elf => }/gc-sections.sh | 0 test/{elf => }/gdb-index-compress-output.sh | 2 +- test/{elf => }/gdb-index-dwarf2.sh | 2 +- test/{elf => }/gdb-index-dwarf3.sh | 2 +- test/{elf => }/gdb-index-dwarf4.sh | 2 +- test/{elf => }/gdb-index-dwarf5.sh | 5 +- test/{elf => }/gdb-index-dwarf64.sh | 2 +- test/{elf => }/gdb-index-empty.sh | 0 test/{elf => }/gdb-index-split-dwarf.sh | 2 +- test/{elf => }/glibc-2.22-bug.sh | 1 - test/{elf => }/global-offset-table.sh | 0 test/{elf => }/gnu-hash.sh | 0 test/gnu-property.sh | 10 + test/gnu-retain.sh | 18 + test/{elf => }/gnu-unique.sh | 0 test/{elf => }/gnu-warning.sh | 0 test/{elf => }/hash-style.sh | 0 test/{elf => }/hello-dynamic.sh | 0 test/{elf => }/hello-static.sh | 0 test/{elf => }/help.sh | 0 test/hidden-archive.sh | 21 + test/{elf => }/hidden-undef.sh | 0 test/{elf => }/hidden-weak-undef.sh | 0 test/{elf => }/icf-safe.sh | 0 test/{elf => }/icf-small.sh | 0 test/{elf => }/icf.sh | 0 .../ifunc-address-equality-exported.sh | 0 test/{elf => }/ifunc-address-equality.sh | 0 test/{elf => }/ifunc-alias.sh | 0 test/{elf => }/ifunc-dlopen.sh | 0 test/{elf => }/ifunc-dso.sh | 0 test/{elf => }/ifunc-dynamic.sh | 0 test/{elf => }/ifunc-export.sh | 0 test/{elf => }/ifunc-funcptr.sh | 0 test/{elf => }/ifunc-noplt.sh | 0 test/{elf => }/ifunc-static-pie.sh | 0 test/{elf => }/ifunc-static.sh | 0 test/{elf => }/image-base.sh | 0 test/{elf => }/init-array-priorities.sh | 0 test/{elf => }/init-in-dso.sh | 0 test/{elf => }/init.sh | 0 test/{elf => }/initfirst.sh | 0 test/{elf => }/interpose.sh | 0 test/{elf => }/invalid-version-script.sh | 0 test/{elf => }/issue646.sh | 3 - test/{elf => }/large-alignment-dso.sh | 0 test/{elf => }/large-alignment.sh | 0 test/{elf => }/large-max-page-size-strip.sh | 0 test/{elf 
=> }/large-max-page-size.sh | 0 test/{elf => }/large-text.sh | 0 test/library.sh | 22 + test/{elf => }/link-order.sh | 0 test/{elf => }/linker-script-defsym.sh | 0 test/linker-script-error.sh | 11 + test/{elf => }/linker-script-relocatable.sh | 0 test/{elf => }/linker-script.sh | 0 test/{elf => }/linker-script2.sh | 0 test/{elf => }/linker-script3.sh | 0 test/{elf => }/linker-script4.sh | 0 test/linker-script5.sh | 14 + test/linker-script6.sh | 15 + test/{elf => }/lto-archive.sh | 4 +- test/lto-archive2.sh | 15 + test/{elf => }/lto-dso.sh | 3 +- test/{elf => }/lto-gcc.sh | 2 +- test/{elf => }/lto-llvm.sh | 2 +- test/{elf => }/lto-nostdlib.sh | 2 + test/{elf => }/lto-version-script.sh | 2 + test/{elf => }/main-in-dso.sh | 0 test/{elf => }/many-sections.sh | 0 test/{elf => }/many-sections2.sh | 2 +- test/{elf => }/mergeable-strings.sh | 0 test/{elf => }/missing-but-ok.sh | 0 test/{elf => }/missing-error.sh | 0 test/{elf => }/mold-wrapper.sh | 2 +- test/mold-wrapper2.sh | 7 + test/nmagic.sh | 14 + test/no-allow-shlib-undefined.sh | 21 + test/{elf => }/no-eh-frame-header.sh | 0 test/{elf/bug178.sh => no-object-file.sh} | 0 test/{elf => }/no-quick-exit.sh | 0 test/{elf => }/no-undefined-version.sh | 0 test/{elf => }/nocopyreloc.sh | 1 - test/{elf => }/noinhibit-exec.sh | 0 test/{elf => }/non-canonical-plt.sh | 0 test/{elf => }/nostdlib.sh | 0 test/{elf => }/oformat-binary.sh | 0 test/{elf => }/omagic.sh | 0 test/package-metadata.sh | 18 + test/{elf => }/physical-image-base.sh | 0 test/{elf => }/pie.sh | 0 test/{elf => }/plt-dso.sh | 0 test/{elf => }/pltgot.sh | 0 test/{elf => }/preinit-array.sh | 0 test/{elf => }/print-dependencies.sh | 0 test/{elf => }/protected-dynsym.sh | 0 test/{elf => }/protected.sh | 0 test/{elf => }/push-pop-state.sh | 0 test/{elf => }/range-extension-thunk.sh | 8 +- test/{elf => }/range-extension-thunk2.sh | 0 test/range-extension-thunk3.sh | 16 + test/{elf => }/relax-got-load.sh | 0 test/{elf => }/reloc-rodata.sh | 0 test/{elf => 
}/relocatable-archive.sh | 0 test/{elf => }/relocatable-c++.sh | 4 - test/relocatable-compressed-debug-info.sh | 21 + test/{elf => }/relocatable-debug-info.sh | 5 +- test/{elf => }/relocatable-exception.sh | 0 test/{elf => }/relocatable-many-sections.sh | 0 test/{elf => }/relocatable-merge-sections.sh | 0 .../relocatable-mergeable-sections.sh | 0 test/{elf => }/relocatable.sh | 0 test/{elf => }/relro.sh | 0 test/{elf => }/repro.sh | 6 +- test/{elf => }/require-defined.sh | 0 test/{elf => }/response-file.sh | 0 test/{elf => }/response-file2.sh | 0 test/{elf => }/retain-symbols-file.sh | 8 +- test/{elf => }/reverse-sections.sh | 0 test/{elf => }/rodata-name.sh | 0 test/{elf => }/rosegment.sh | 0 test/{elf => }/rpath.sh | 0 test/{elf => }/run-clang.sh | 2 +- test/run.sh | 52 + test/{elf => }/section-align.sh | 0 test/section-attributes.sh | 24 + test/{elf => }/section-order.sh | 3 +- test/{elf => }/section-start.sh | 0 test/separate-debug-file.sh | 28 + test/shared-abs-sym.sh | 30 + test/{elf => }/shared.sh | 0 test/{elf => }/shuffle-sections-seed.sh | 0 test/{elf => }/shuffle-sections.sh | 0 test/{elf => }/soname.sh | 0 test/spare-program-headers.sh | 25 + test/{elf => }/start-lib.sh | 0 test/{elf => }/start-stop-symbol.sh | 0 test/{elf => }/start-stop.sh | 0 test/{elf => }/static-archive.sh | 0 test/{elf => }/static-pie.sh | 0 test/{elf => }/stdout.sh | 0 test/{elf => }/strip-debug.sh | 0 test/{elf => }/strip.sh | 4 +- test/stt-common.sh | 26 + test/{elf => }/symbol-rank.sh | 0 test/{elf => }/symbol-version-lto.sh | 2 + test/{elf => }/symbol-version.sh | 0 test/{elf => }/symbol-version2.sh | 0 test/{elf => }/symbol-version3.sh | 0 test/symbol-version4.sh | 58 + test/{elf => }/symtab-dso.sh | 0 test/{elf => }/symtab-section-symbols.sh | 0 test/{elf => }/symtab.sh | 0 test/{elf => }/synthetic-symbols.sh | 0 test/{elf => }/sysroot-linker-script.sh | 0 test/{elf => }/sysroot.sh | 0 test/{elf => }/sysroot2.sh | 0 test/{elf => }/tail-call.sh | 0 test/tbss-only.sh | 14 + 
test/{elf => }/thin-archive.sh | 2 +- test/{elf => }/thread-count.sh | 0 test/{elf => }/tls-alignment-multi.sh | 0 test/{elf => }/tls-common.sh | 1 + test/{elf => }/tls-df-static-tls.sh | 0 test/{elf => }/tls-dso.sh | 0 test/{elf => }/tls-gd-dlopen.sh | 0 test/{elf => }/tls-gd-noplt.sh | 1 - test/{elf => }/tls-gd-to-ie.sh | 0 test/{elf => }/tls-gd.sh | 1 - test/{elf => }/tls-ie.sh | 0 test/{elf => }/tls-irregular-start-addr.sh | 0 test/{elf => }/tls-large-alignment.sh | 0 test/{elf => }/tls-large-static-image.sh | 0 test/{elf => }/tls-ld-noplt.sh | 0 test/{elf => }/tls-ld.sh | 0 test/{elf => }/tls-le-error.sh | 0 test/{elf => }/tls-le.sh | 8 +- test/{elf => }/tls-nopic.sh | 0 test/{elf => }/tls-pic.sh | 0 test/{elf => }/tls-small-alignment.sh | 0 test/{elf => }/tlsdesc-dlopen.sh | 0 test/{elf => }/tlsdesc-import.sh | 0 test/{elf => }/tlsdesc-initial-exec.sh | 12 +- test/{elf => }/tlsdesc-local-dynamic.sh | 0 test/{elf => }/tlsdesc-static.sh | 0 test/{elf => }/tlsdesc.sh | 0 test/trace-symbol-symver.sh | 30 + test/{elf => }/trace-symbol.sh | 0 test/{elf => }/trace.sh | 0 test/undefined-glob-gc-sections.sh | 29 + test/undefined-glob.sh | 35 + test/{elf => }/undefined.sh | 0 test/{elf => }/undefined2.sh | 0 test/unkown-section-type.sh | 9 + test/{elf => }/unresolved-symbols.sh | 0 test/unresolved-symbols2.sh | 10 + test/{elf => }/verbose.sh | 0 test/{elf => }/version-script-search-paths.sh | 0 test/{elf => }/version-script.sh | 0 test/{elf => }/version-script10.sh | 0 test/{elf => }/version-script11.sh | 0 test/{elf => }/version-script12.sh | 0 test/{elf => }/version-script13.sh | 0 test/{elf => }/version-script14.sh | 0 test/{elf => }/version-script15.sh | 0 test/{elf => }/version-script16.sh | 0 test/{elf => }/version-script17.sh | 0 test/{elf => }/version-script18.sh | 0 test/{elf => }/version-script19.sh | 0 test/{elf => }/version-script2.sh | 0 test/version-script20.sh | 19 + test/version-script21.sh | 19 + test/version-script22.sh | 15 + test/version-script23.sh 
| 15 + test/{elf => }/version-script3.sh | 0 test/{elf => }/version-script4.sh | 0 test/{elf => }/version-script5.sh | 0 test/{elf => }/version-script6.sh | 0 test/{elf => }/version-script7.sh | 0 test/{elf => }/version-script8.sh | 0 test/{elf => }/version-script9.sh | 0 test/{elf => }/version.sh | 10 +- test/{elf => }/versioned-undef.sh | 0 test/{elf => }/visibility.sh | 0 test/{elf => }/warn-common.sh | 0 test/{elf => }/warn-once.sh | 2 +- test/{elf => }/warn-symbol-type.sh | 0 test/{elf => }/warn-unresolved-symbols.sh | 0 test/{elf => }/weak-export-dso.sh | 0 test/weak-export-dso2.sh | 21 + test/{elf => }/weak-export-exe.sh | 0 test/{elf => }/weak-undef-dso.sh | 0 test/{elf => }/weak-undef.sh | 0 test/{elf => }/weak-undef2.sh | 0 test/{elf => }/weak-undef4.sh | 0 test/weak-undef5.sh | 21 + test/{elf => }/whole-archive.sh | 18 +- test/{elf => }/wrap-lto.sh | 2 + test/{elf => }/wrap.sh | 0 test/{elf => }/z-cet-report.sh | 0 test/{elf => }/z-defs.sh | 0 test/{elf => }/z-dynamic-undefined-weak.sh | 0 test/{elf => }/z-max-page-size.sh | 0 test/{elf => }/z-nodefaultlib.sh | 0 test/{elf => }/z-nodump.sh | 0 test/{elf => }/z-now.sh | 0 test/{elf => }/z-origin.sh | 0 test/z-pack-relative-relocs.sh | 21 + test/z-rodynamic.sh | 12 + test/{elf => }/z-sectionheader.sh | 0 test/{elf => }/z-separate-code.sh | 0 test/{elf => }/z-stack-size.sh | 0 test/z-start-stop-visibility.sh | 28 + test/{elf => }/z-unknown.sh | 0 third-party/blake3/.git-blame-ignore-revs | 2 + third-party/blake3/.github/workflows/ci.yml | 77 +- third-party/blake3/.github/workflows/tag.yml | 8 +- third-party/blake3/Cargo.toml | 37 +- third-party/blake3/README.md | 2 + third-party/blake3/b3sum/Cargo.lock | 482 +-- third-party/blake3/b3sum/Cargo.toml | 5 +- third-party/blake3/b3sum/src/main.rs | 162 +- third-party/blake3/build.rs | 22 +- third-party/blake3/c/CMakeLists.txt | 67 +- third-party/blake3/c/blake3.c | 19 +- third-party/blake3/c/blake3.h | 2 +- .../c/blake3_c_rust_bindings/Cargo.toml | 6 +- 
.../c/blake3_c_rust_bindings/src/test.rs | 2 +- third-party/blake3/c/blake3_dispatch.c | 39 +- third-party/blake3/c/blake3_impl.h | 6 +- third-party/blake3/c/blake3_neon.c | 6 +- third-party/blake3/rust/guts/Cargo.toml | 18 + third-party/blake3/rust/guts/readme.md | 80 + third-party/blake3/rust/guts/src/lib.rs | 1000 +++++ third-party/blake3/rust/guts/src/portable.rs | 262 ++ third-party/blake3/rust/guts/src/test.rs | 523 +++ third-party/blake3/src/io.rs | 79 + third-party/blake3/src/lib.rs | 304 +- third-party/blake3/src/platform.rs | 24 + third-party/blake3/src/test.rs | 208 + third-party/blake3/tools/release.md | 2 +- third-party/mimalloc/.gitignore | 2 + third-party/mimalloc/CMakeLists.txt | 144 +- third-party/mimalloc/SECURITY.md | 41 + third-party/mimalloc/bin/readme.md | 71 + .../cmake/mimalloc-config-version.cmake | 2 +- third-party/mimalloc/doc/doxyfile | 2 +- third-party/mimalloc/doc/mimalloc-doc.h | 16 +- .../mimalloc/docker/alpine-arm32v7/Dockerfile | 28 + third-party/mimalloc/docker/alpine/Dockerfile | 23 + .../mimalloc/docker/manylinux-x64/Dockerfile | 23 + third-party/mimalloc/docker/readme.md | 10 + .../ide/vs2017/mimalloc-override.vcxproj | 1 + .../mimalloc/ide/vs2017/mimalloc.vcxproj | 1 + .../ide/vs2019/mimalloc-override.vcxproj | 1 + .../mimalloc/ide/vs2019/mimalloc.vcxproj | 1 + .../ide/vs2022/mimalloc-override.vcxproj | 1 + .../mimalloc/ide/vs2022/mimalloc.vcxproj | 7 + .../mimalloc/include/mimalloc-override.h | 3 +- third-party/mimalloc/include/mimalloc.h | 64 +- .../mimalloc/include/mimalloc/atomic.h | 18 +- .../mimalloc/include/mimalloc/internal.h | 101 +- third-party/mimalloc/include/mimalloc/prim.h | 116 +- third-party/mimalloc/include/mimalloc/track.h | 2 + third-party/mimalloc/include/mimalloc/types.h | 173 +- third-party/mimalloc/readme.md | 114 +- third-party/mimalloc/src/alloc-aligned.c | 136 +- third-party/mimalloc/src/alloc-override.c | 27 +- third-party/mimalloc/src/alloc.c | 582 +-- third-party/mimalloc/src/arena.c | 368 +- 
third-party/mimalloc/src/bitmap.c | 14 +- third-party/mimalloc/src/bitmap.h | 4 +- third-party/mimalloc/src/free.c | 530 +++ third-party/mimalloc/src/heap.c | 85 +- third-party/mimalloc/src/init.c | 93 +- third-party/mimalloc/src/libc.c | 273 ++ third-party/mimalloc/src/options.c | 129 +- third-party/mimalloc/src/os.c | 91 +- third-party/mimalloc/src/page-queue.c | 55 +- third-party/mimalloc/src/page.c | 110 +- .../mimalloc/src/prim/emscripten/prim.c | 244 ++ .../src/prim/osx/alloc-override-zone.c | 3 + third-party/mimalloc/src/prim/prim.c | 3 + third-party/mimalloc/src/prim/unix/prim.c | 146 +- third-party/mimalloc/src/prim/wasi/prim.c | 7 +- third-party/mimalloc/src/prim/windows/prim.c | 49 +- third-party/mimalloc/src/segment-map.c | 8 +- third-party/mimalloc/src/segment.c | 571 ++- third-party/mimalloc/src/static.c | 1 + third-party/mimalloc/src/stats.c | 60 +- third-party/mimalloc/test/main-override.cpp | 4 +- third-party/mimalloc/test/test-api.c | 108 +- third-party/mimalloc/test/test-stress.c | 21 +- third-party/tbb/.bazelversion | 2 +- third-party/tbb/.github/CODEOWNERS | 7 + third-party/tbb/.github/issue_labeler.yml | 27 + third-party/tbb/.github/workflows/ci.yml | 47 +- .../tbb/.github/workflows/issue_labeler.yml | 37 + third-party/tbb/.github/workflows/labeler.yml | 4 +- third-party/tbb/BUILD.bazel | 15 +- third-party/tbb/CMakeLists.txt | 95 +- third-party/tbb/CONTRIBUTING.md | 5 - third-party/tbb/INSTALL.md | 2 +- third-party/tbb/MODULE.bazel | 24 + third-party/tbb/README.md | 6 +- third-party/tbb/RELEASE_NOTES.md | 33 +- third-party/tbb/SECURITY.md | 69 +- third-party/tbb/SUPPORT.md | 35 + third-party/tbb/SYSTEM_REQUIREMENTS.md | 12 +- third-party/tbb/WASM_Support.md | 41 +- third-party/tbb/WORKSPACE.bazel | 4 +- third-party/tbb/cmake/README.md | 133 +- third-party/tbb/cmake/compilers/Clang.cmake | 26 +- third-party/tbb/cmake/compilers/GNU.cmake | 16 +- third-party/tbb/cmake/compilers/Intel.cmake | 8 +- .../tbb/cmake/compilers/IntelLLVM.cmake | 4 +- 
third-party/tbb/cmake/compilers/MSVC.cmake | 6 +- third-party/tbb/cmake/config_generation.cmake | 2 + third-party/tbb/cmake/hwloc_detection.cmake | 4 +- third-party/tbb/cmake/resumable_tasks.cmake | 31 + .../tbb/cmake/templates/TBBConfig.cmake.in | 2 + third-party/tbb/cmake/utils.cmake | 46 +- third-party/tbb/cmake/vars_utils.cmake | 12 +- third-party/tbb/doc/GSG/next_steps.rst | 6 + third-party/tbb/doc/conf.py | 6 +- third-party/tbb/doc/index/toctree.rst | 1 + third-party/tbb/doc/main/intro/Benefits.rst | 2 +- .../tbb/doc/main/intro/limitations.rst | 46 + .../tbb/doc/main/reference/reference.rst | 1 + .../tbb/doc/main/reference/rvalue_reduce.rst | 89 + .../tbb/doc/main/tbb_userguide/Edges.rst | 4 +- .../tbb_userguide/Flow_Graph_Reservation.rst | 10 +- .../Migration_Guide/Mixing_Two_Runtimes.rst | 1 + ...e_Contents.rst => Package_Contents_os.rst} | 3 +- .../Working_on_the_Assembly_Line_pipeline.rst | 2 +- .../tbb_userguide/concurrent_hash_map.rst | 6 +- .../tbb/doc/main/tbb_userguide/title.rst | 2 +- third-party/tbb/doc/make.bat | 2 +- third-party/tbb/examples/CMakeLists.txt | 7 +- third-party/tbb/examples/README.md | 1 + .../tbb/examples/common/gui/CMakeLists.txt | 4 +- .../count_strings/CMakeLists.txt | 4 +- .../count_strings/count_strings.cpp | 4 +- .../shortpath/CMakeLists.txt | 4 +- .../tbb/examples/getting_started/README.md | 2 +- .../sub_string_finder/CMakeLists.txt | 4 +- .../sub_string_finder/README.md | 2 +- .../tbb/examples/graph/binpack/CMakeLists.txt | 4 +- .../examples/graph/cholesky/CMakeLists.txt | 4 +- .../graph/dining_philosophers/CMakeLists.txt | 4 +- .../tbb/examples/graph/fgbzip2/CMakeLists.txt | 6 +- .../examples/graph/logic_sim/CMakeLists.txt | 4 +- .../tbb/examples/graph/som/CMakeLists.txt | 4 +- third-party/tbb/examples/migration/README.md | 6 + .../recursive_fibonacci/CMakeLists.txt | 40 + .../migration/recursive_fibonacci/README.md | 23 + .../recursive_fibonacci/fibonacci.cpp | 61 + .../fibonacci_single_task.h | 97 + 
.../recursive_fibonacci/fibonacci_two_tasks.h | 79 + .../task_emulation_layer.h | 225 ++ .../parallel_for/game_of_life/CMakeLists.txt | 4 +- .../polygon_overlay/CMakeLists.txt | 4 +- .../parallel_for/seismic/CMakeLists.txt | 4 +- .../parallel_for/tachyon/CMakeLists.txt | 5 +- .../parallel_for/tachyon/src/imageio.cpp | 7 +- .../parallel_for/tachyon/src/imageio.hpp | 4 +- .../parallel_for/tachyon/src/jpeg.cpp | 4 +- .../examples/parallel_for/tachyon/src/ppm.cpp | 4 +- .../examples/parallel_for/tachyon/src/ppm.hpp | 4 +- .../parallel_preorder/CMakeLists.txt | 4 +- .../parallel_pipeline/square/CMakeLists.txt | 4 +- .../tbb/examples/parallel_reduce/README.md | 1 + .../convex_hull/CMakeLists.txt | 4 +- .../parallel_reduce/pi/CMakeLists.txt | 33 + .../tbb/examples/parallel_reduce/pi/README.md | 24 + .../tbb/examples/parallel_reduce/pi/common.h | 51 + .../tbb/examples/parallel_reduce/pi/main.cpp | 100 + .../tbb/examples/parallel_reduce/pi/pi.cpp | 55 + .../parallel_reduce/primes/CMakeLists.txt | 4 +- .../task_arena/fractal/CMakeLists.txt | 4 +- .../examples/task_group/sudoku/CMakeLists.txt | 4 +- .../test_all/fibonacci/CMakeLists.txt | 4 +- third-party/tbb/include/oneapi/tbb.h | 4 +- .../tbb/include/oneapi/tbb/concurrent_queue.h | 2 +- .../tbb/detail/_concurrent_unordered_base.h | 2 +- .../tbb/include/oneapi/tbb/detail/_config.h | 7 +- .../tbb/include/oneapi/tbb/detail/_machine.h | 4 +- .../oneapi/tbb/detail/_template_helpers.h | 1 - .../tbb/include/oneapi/tbb/detail/_utils.h | 6 + .../oneapi/tbb/detail/_waitable_atomic.h | 20 +- .../oneapi/tbb/enumerable_thread_specific.h | 10 +- third-party/tbb/include/oneapi/tbb/mutex.h | 6 +- .../tbb/include/oneapi/tbb/parallel_for.h | 10 +- .../include/oneapi/tbb/parallel_for_each.h | 29 + .../tbb/include/oneapi/tbb/parallel_invoke.h | 4 +- .../tbb/include/oneapi/tbb/parallel_reduce.h | 49 +- .../tbb/include/oneapi/tbb/parallel_scan.h | 4 +- .../tbb/include/oneapi/tbb/partitioner.h | 2 +- .../include/oneapi/tbb/scalable_allocator.h | 
3 +- .../tbb/include/oneapi/tbb/task_arena.h | 5 +- .../tbb/include/oneapi/tbb/task_group.h | 6 +- third-party/tbb/include/oneapi/tbb/version.h | 38 +- third-party/tbb/include/tbb/mutex.h | 17 + third-party/tbb/include/tbb/rw_mutex.h | 17 + third-party/tbb/integration/linux/env/vars.sh | 26 +- .../tbb/integration/linux/modulefiles/tbb | 99 +- .../tbb/integration/linux/modulefiles/tbb32 | 101 +- .../tbb/integration/linux/oneapi/vars.sh | 34 + .../tbb/integration/windows/env/vars.bat | 55 +- .../windows/nuget/inteltbb.devel.win.targets | 12 +- .../tbb/integration/windows/oneapi/vars.bat | 56 + third-party/tbb/python/CMakeLists.txt | 6 +- third-party/tbb/python/README.md | 2 +- third-party/tbb/python/setup.py | 2 +- third-party/tbb/python/tbb/pool.py | 48 +- third-party/tbb/src/tbb/CMakeLists.txt | 115 +- third-party/tbb/src/tbb/allocator.cpp | 4 +- third-party/tbb/src/tbb/arena.cpp | 367 +- third-party/tbb/src/tbb/arena.h | 283 +- third-party/tbb/src/tbb/arena_slot.h | 7 +- .../tbb/src/tbb/cancellation_disseminator.h | 85 + third-party/tbb/src/tbb/concurrent_monitor.h | 12 +- third-party/tbb/src/tbb/dynamic_link.cpp | 5 +- third-party/tbb/src/tbb/global_control.cpp | 113 +- third-party/tbb/src/tbb/governor.cpp | 55 +- third-party/tbb/src/tbb/governor.h | 7 +- third-party/tbb/src/tbb/main.cpp | 18 +- third-party/tbb/src/tbb/market.cpp | 631 +--- third-party/tbb/src/tbb/market.h | 287 +- third-party/tbb/src/tbb/misc.cpp | 17 +- third-party/tbb/src/tbb/misc.h | 3 +- third-party/tbb/src/tbb/misc_ex.cpp | 3 +- third-party/tbb/src/tbb/permit_manager.h | 61 + third-party/tbb/src/tbb/pm_client.h | 76 + third-party/tbb/src/tbb/rml_tbb.cpp | 4 +- third-party/tbb/src/tbb/scheduler_common.h | 35 +- third-party/tbb/src/tbb/task.cpp | 18 +- third-party/tbb/src/tbb/task_dispatcher.h | 7 +- .../tbb/src/tbb/task_group_context.cpp | 51 +- third-party/tbb/src/tbb/tbb.rc | 4 +- third-party/tbb/src/tbb/tcm.h | 173 + third-party/tbb/src/tbb/tcm_adaptor.cpp | 321 ++ 
third-party/tbb/src/tbb/tcm_adaptor.h | 63 + .../tbb/src/tbb/thread_control_monitor.h | 116 + third-party/tbb/src/tbb/thread_data.h | 39 +- third-party/tbb/src/tbb/thread_dispatcher.cpp | 236 ++ third-party/tbb/src/tbb/thread_dispatcher.h | 107 + .../tbb/src/tbb/thread_dispatcher_client.h | 69 + .../tbb/src/tbb/thread_request_serializer.cpp | 140 + .../tbb/src/tbb/thread_request_serializer.h | 84 + third-party/tbb/src/tbb/threading_control.cpp | 406 ++ third-party/tbb/src/tbb/threading_control.h | 154 + .../tbb/src/tbb/threading_control_client.h | 58 + third-party/tbb/src/tbb/tools_api/ittnotify.h | 255 +- .../tbb/src/tbb/tools_api/ittnotify_config.h | 80 +- .../tbb/src/tbb/tools_api/ittnotify_static.c | 208 +- .../tbb/src/tbb/tools_api/ittnotify_static.h | 15 +- .../tbb/src/tbb/tools_api/legacy/ittnotify.h | 16 +- third-party/tbb/src/tbb/waiters.h | 42 +- third-party/tbb/src/tbbbind/CMakeLists.txt | 10 - .../tbb/src/tbbbind/def/mac64-tbbbind.def | 18 + third-party/tbb/src/tbbbind/tbb_bind.cpp | 9 +- third-party/tbb/src/tbbbind/tbb_bind.rc | 4 +- third-party/tbb/src/tbbmalloc/CMakeLists.txt | 14 +- third-party/tbb/src/tbbmalloc/Synchronize.h | 6 +- .../tbb/src/tbbmalloc/TypeDefinitions.h | 4 +- third-party/tbb/src/tbbmalloc/backend.cpp | 51 +- third-party/tbb/src/tbbmalloc/backend.h | 3 +- third-party/tbb/src/tbbmalloc/frontend.cpp | 21 +- .../tbb/src/tbbmalloc/large_objects.cpp | 22 +- third-party/tbb/src/tbbmalloc/large_objects.h | 7 +- third-party/tbb/src/tbbmalloc/tbbmalloc.cpp | 6 +- third-party/tbb/src/tbbmalloc/tbbmalloc.rc | 4 +- .../tbb/src/tbbmalloc/tbbmalloc_internal.h | 18 +- .../tbb/src/tbbmalloc_proxy/CMakeLists.txt | 12 +- .../src/tbbmalloc_proxy/tbbmalloc_proxy.rc | 4 +- third-party/tbb/test/CMakeLists.txt | 139 +- .../test/common/common_arena_constraints.h | 11 +- .../tbb/test/common/concurrency_tracker.h | 4 +- .../common/concurrent_associative_common.h | 4 +- third-party/tbb/test/common/cpu_usertime.h | 4 +- third-party/tbb/test/common/doctest.h 
| 3 + .../tbb/test/common/utils_concurrency_limit.h | 55 +- .../tbb/test/common/utils_dynamic_libs.h | 19 +- .../conformance_blocked_rangeNd.cpp | 4 +- .../conformance_concurrent_hash_map.cpp | 6 +- .../conformance_concurrent_queue.cpp | 4 +- .../conformance_concurrent_vector.cpp | 6 +- .../test/conformance/conformance_flowgraph.h | 21 +- .../conformance_global_control.cpp | 37 +- .../conformance/conformance_parallel_for.cpp | 4 +- .../conformance_parallel_for_each.cpp | 2 - .../conformance_parallel_reduce.cpp | 162 +- .../tbb/test/tbb/test_arena_constraints.cpp | 6 +- .../tbb/test/tbb/test_arena_priorities.cpp | 3 +- third-party/tbb/test/tbb/test_async_node.cpp | 6 +- .../tbb/test/tbb/test_broadcast_node.cpp | 6 +- third-party/tbb/test/tbb/test_buffer_node.cpp | 4 +- .../test/tbb/test_collaborative_call_once.cpp | 7 +- .../tbb/test/tbb/test_concurrent_hash_map.cpp | 6 +- .../tbb/test/tbb/test_concurrent_queue.cpp | 3 +- .../tbb/test/tbb/test_continue_node.cpp | 4 +- .../tbb/test/tbb/test_eh_algorithms.cpp | 376 +- .../tbb/test/tbb/test_eh_flow_graph.cpp | 7 +- third-party/tbb/test/tbb/test_eh_thread.cpp | 4 +- .../test/tbb/test_flow_graph_priorities.cpp | 5 +- third-party/tbb/test/tbb/test_fuzzing.cpp | 41 + .../tbb/test/tbb/test_global_control.cpp | 25 +- .../tbb/test/tbb/test_join_node_preview.cpp | 4 +- .../tbb/test/tbb/test_limiter_node.cpp | 4 +- third-party/tbb/test/tbb/test_mutex.cpp | 9 +- .../tbb/test/tbb/test_parallel_for_each.cpp | 163 +- .../tbb/test/tbb/test_parallel_invoke.cpp | 5 +- third-party/tbb/test/tbb/test_partitioner.cpp | 5 +- .../tbb/test/tbb/test_resumable_tasks.cpp | 6 +- .../tbb/test/tbb/test_scheduler_mix.cpp | 27 +- third-party/tbb/test/tbb/test_task.cpp | 22 +- third-party/tbb/test/tbb/test_task_arena.cpp | 108 +- third-party/tbb/test/tbb/test_task_group.cpp | 109 +- third-party/tbb/test/tbb/test_tbb_header.cpp | 4 +- .../test/tbbmalloc/test_malloc_compliance.cpp | 7 +- .../test/tbbmalloc/test_malloc_whitebox.cpp | 10 +- 
third-party/tbb/third-party-programs.txt | 413 +- third-party/zlib/.github/workflows/cmake.yml | 89 - .../zlib/.github/workflows/configure.yml | 136 - third-party/zlib/.github/workflows/fuzz.yml | 25 - third-party/zlib/.gitignore | 26 - third-party/zlib/CMakeLists.txt | 31 +- third-party/zlib/ChangeLog | 10 + third-party/zlib/FAQ | 3 +- third-party/zlib/Makefile.in | 16 +- third-party/zlib/README | 6 +- third-party/zlib/configure | 9 +- third-party/zlib/contrib/delphi/ZLib.pas | 2 +- third-party/zlib/contrib/infback9/inftree9.c | 6 +- third-party/zlib/contrib/infback9/inftree9.h | 4 +- third-party/zlib/contrib/iostream3/zfstream.h | 4 +- third-party/zlib/contrib/minizip/Makefile | 2 +- third-party/zlib/contrib/minizip/configure.ac | 2 +- third-party/zlib/contrib/minizip/ioapi.h | 2 +- third-party/zlib/contrib/minizip/miniunz.c | 18 +- third-party/zlib/contrib/minizip/unzip.c | 8 +- third-party/zlib/contrib/minizip/unzip.h | 2 +- third-party/zlib/contrib/minizip/zip.c | 21 +- third-party/zlib/contrib/minizip/zip.h | 4 +- third-party/zlib/contrib/nuget/nuget.csproj | 43 + third-party/zlib/contrib/nuget/nuget.sln | 22 + third-party/zlib/contrib/pascal/zlibpas.pas | 2 +- third-party/zlib/contrib/puff/puff.c | 8 +- third-party/zlib/contrib/vstudio/readme.txt | 156 +- third-party/zlib/contrib/vstudio/vc10/zlib.rc | 8 +- .../zlib/contrib/vstudio/vc10/zlibvc.def | 2 +- third-party/zlib/contrib/vstudio/vc11/zlib.rc | 8 +- .../zlib/contrib/vstudio/vc11/zlibvc.def | 2 +- third-party/zlib/contrib/vstudio/vc12/zlib.rc | 8 +- .../zlib/contrib/vstudio/vc12/zlibvc.def | 2 +- third-party/zlib/contrib/vstudio/vc14/zlib.rc | 8 +- .../zlib/contrib/vstudio/vc14/zlibvc.def | 2 +- .../zlib/contrib/vstudio/vc17/miniunz.vcxproj | 409 ++ .../zlib/contrib/vstudio/vc17/minizip.vcxproj | 405 ++ .../contrib/vstudio/vc17/testzlib.vcxproj | 473 +++ .../contrib/vstudio/vc17/testzlibdll.vcxproj | 409 ++ third-party/zlib/contrib/vstudio/vc17/zlib.rc | 32 + .../contrib/vstudio/vc17/zlibstat.vcxproj | 
602 +++ .../zlib/contrib/vstudio/vc17/zlibvc.def | 158 + .../zlib/contrib/vstudio/vc17/zlibvc.sln | 179 + .../zlib/contrib/vstudio/vc17/zlibvc.vcxproj | 875 +++++ third-party/zlib/contrib/vstudio/vc9/zlib.rc | 8 +- .../zlib/contrib/vstudio/vc9/zlibvc.def | 2 +- third-party/zlib/deflate.c | 47 +- third-party/zlib/deflate.h | 35 +- third-party/zlib/doc/algorithm.txt | 2 +- third-party/zlib/examples/gzlog.c | 4 +- third-party/zlib/examples/zran.c | 2 +- third-party/zlib/gzguts.h | 8 +- third-party/zlib/gzlib.c | 12 +- third-party/zlib/inflate.c | 2 +- third-party/zlib/inftrees.c | 6 +- third-party/zlib/inftrees.h | 4 +- third-party/zlib/old/visual-basic.txt | 2 +- third-party/zlib/os400/README400 | 2 +- third-party/zlib/os400/zlib.inc | 6 +- third-party/zlib/qnx/package.qpg | 10 +- third-party/zlib/test/example.c | 25 +- third-party/zlib/test/minigzip.c | 32 +- third-party/zlib/treebuild.xml | 4 +- third-party/zlib/trees.c | 20 +- third-party/zlib/win32/DLL_FAQ.txt | 20 +- third-party/zlib/win32/README-WIN32.txt | 8 +- third-party/zlib/zconf.h.cmakein | 10 +- third-party/zlib/zconf.h.in | 10 +- third-party/zlib/zconf.h.included | 10 +- third-party/zlib/zlib.3 | 6 +- third-party/zlib/zlib.3.pdf | Bin 19505 -> 25523 bytes third-party/zlib/zlib.h | 22 +- third-party/zlib/zlib.map | 200 +- third-party/zlib/zutil.h | 27 +- third-party/zstd/.cirrus.yml | 4 +- third-party/zstd/.github/workflows/commit.yml | 89 + .../zstd/.github/workflows/dev-long-tests.yml | 88 +- .../.github/workflows/dev-short-tests.yml | 161 +- .../zstd/.github/workflows/nightly.yml | 65 + .../workflows/publish-release-artifacts.yml | 2 +- .../zstd/.github/workflows/scorecards.yml | 8 +- .../.github/workflows/windows-artifacts.yml | 29 +- third-party/zstd/.gitignore | 8 +- third-party/zstd/.travis.yml | 128 - third-party/zstd/CHANGELOG | 41 +- third-party/zstd/CONTRIBUTING.md | 6 +- third-party/zstd/Makefile | 54 +- third-party/zstd/README.md | 8 +- third-party/zstd/SECURITY.md | 15 + 
third-party/zstd/appveyor.yml | 205 - .../zstd/build/VS2008/zstd/zstd.vcproj | 4 + .../zstd/build/VS2010/datagen/datagen.vcxproj | 2 + .../zstd/build/VS2010/zstd/zstd.vcxproj | 1 + third-party/zstd/build/cmake/CMakeLists.txt | 66 +- third-party/zstd/build/cmake/README.md | 32 + .../build/cmake/contrib/pzstd/CMakeLists.txt | 1 + .../zstd/build/cmake/lib/CMakeLists.txt | 122 +- .../zstd/build/cmake/programs/CMakeLists.txt | 11 +- .../zstd/build/cmake/tests/CMakeLists.txt | 12 +- third-party/zstd/build/cmake/zstdConfig.cmake | 1 - .../zstd/build/cmake/zstdConfig.cmake.in | 10 + .../zstd/build/meson/programs/meson.build | 1 + .../zstd/build/meson/tests/meson.build | 4 +- third-party/zstd/contrib/linux-kernel/mem.h | 1 + .../linux-kernel/zstd_decompress_module.c | 2 +- .../zstd/contrib/linux-kernel/zstd_deps.h | 6 +- third-party/zstd/contrib/pzstd/Makefile | 15 +- .../examples/parallel_processing.c | 2 +- third-party/zstd/doc/decompressor_errata.md | 66 +- .../zstd/doc/decompressor_permissive.md | 60 + .../doc/educational_decoder/zstd_decompress.c | 27 +- .../zstd/doc/zstd_compression_format.md | 37 +- third-party/zstd/doc/zstd_manual.html | 129 +- .../zstd/examples/streaming_compression.c | 10 +- third-party/zstd/lib/Makefile | 68 +- third-party/zstd/lib/README.md | 15 +- third-party/zstd/lib/common/allocations.h | 2 +- third-party/zstd/lib/common/bitstream.h | 78 +- third-party/zstd/lib/common/compiler.h | 136 +- third-party/zstd/lib/common/cpu.h | 36 + third-party/zstd/lib/common/debug.c | 6 + third-party/zstd/lib/common/debug.h | 31 +- third-party/zstd/lib/common/error_private.h | 81 +- third-party/zstd/lib/common/fse.h | 5 +- third-party/zstd/lib/common/fse_decompress.c | 36 +- third-party/zstd/lib/common/huf.h | 15 +- third-party/zstd/lib/common/mem.h | 9 - third-party/zstd/lib/common/pool.c | 2 +- third-party/zstd/lib/common/pool.h | 2 +- .../zstd/lib/common/portability_macros.h | 2 + third-party/zstd/lib/common/threading.c | 10 +- 
third-party/zstd/lib/common/xxhash.c | 16 +- third-party/zstd/lib/common/xxhash.h | 3348 ++++++++++++----- third-party/zstd/lib/common/zstd_internal.h | 10 +- third-party/zstd/lib/compress/fse_compress.c | 15 +- third-party/zstd/lib/compress/huf_compress.c | 79 +- third-party/zstd/lib/compress/zstd_compress.c | 443 ++- .../lib/compress/zstd_compress_internal.h | 56 +- .../lib/compress/zstd_compress_superblock.c | 337 +- third-party/zstd/lib/compress/zstd_cwksp.h | 32 +- .../zstd/lib/compress/zstd_double_fast.c | 22 +- .../zstd/lib/compress/zstd_double_fast.h | 11 + third-party/zstd/lib/compress/zstd_fast.c | 20 +- third-party/zstd/lib/compress/zstd_lazy.c | 216 +- third-party/zstd/lib/compress/zstd_lazy.h | 131 +- third-party/zstd/lib/compress/zstd_ldm.c | 10 +- third-party/zstd/lib/compress/zstd_opt.c | 328 +- third-party/zstd/lib/compress/zstd_opt.h | 38 +- .../zstd/lib/compress/zstdmt_compress.c | 173 +- .../zstd/lib/decompress/huf_decompress.c | 314 +- .../lib/decompress/huf_decompress_amd64.S | 57 +- .../zstd/lib/decompress/zstd_decompress.c | 116 +- .../lib/decompress/zstd_decompress_block.c | 439 ++- .../lib/decompress/zstd_decompress_block.h | 2 +- .../lib/decompress/zstd_decompress_internal.h | 2 + third-party/zstd/lib/dictBuilder/cover.c | 28 +- third-party/zstd/lib/dictBuilder/cover.h | 10 +- third-party/zstd/lib/dictBuilder/fastcover.c | 4 +- third-party/zstd/lib/dictBuilder/zdict.c | 18 +- third-party/zstd/lib/legacy/zstd_legacy.h | 30 + third-party/zstd/lib/legacy/zstd_v01.c | 2 + third-party/zstd/lib/legacy/zstd_v02.c | 20 +- third-party/zstd/lib/legacy/zstd_v03.c | 20 +- third-party/zstd/lib/legacy/zstd_v04.c | 15 +- third-party/zstd/lib/legacy/zstd_v05.c | 1 + third-party/zstd/lib/legacy/zstd_v06.c | 11 +- third-party/zstd/lib/legacy/zstd_v07.c | 12 +- third-party/zstd/lib/libzstd.mk | 45 +- third-party/zstd/lib/zstd.h | 189 +- third-party/zstd/programs/.gitignore | 2 + third-party/zstd/programs/Makefile | 13 +- third-party/zstd/programs/benchfn.c 
| 4 +- third-party/zstd/programs/benchzstd.c | 1188 +++--- third-party/zstd/programs/benchzstd.h | 12 +- third-party/zstd/programs/fileio.c | 134 +- third-party/zstd/programs/fileio_asyncio.c | 8 +- third-party/zstd/programs/lorem.c | 285 ++ third-party/zstd/programs/lorem.h | 32 + third-party/zstd/programs/platform.h | 9 +- third-party/zstd/programs/util.c | 38 +- third-party/zstd/programs/util.h | 2 +- third-party/zstd/programs/zstd.1 | 489 ++- third-party/zstd/programs/zstd.1.md | 359 +- third-party/zstd/programs/zstdcli.c | 64 +- third-party/zstd/programs/zstdgrep.1 | 13 +- third-party/zstd/programs/zstdless.1 | 9 +- third-party/zstd/tests/.gitignore | 3 + third-party/zstd/tests/Makefile | 101 +- .../zstd/tests/cli-tests/basic/args.sh | 10 + .../zstd/tests/cli-tests/basic/args.sh.exit | 1 + .../tests/cli-tests/basic/args.sh.stderr.glob | 28 + .../cli-tests/decompression/detectErrors.sh | 11 + .../file-handling/directory-mirror.sh | 49 + .../directory-mirror.sh.stderr.exact | 0 .../directory-mirror.sh.stdout.exact | 0 third-party/zstd/tests/datagencli.c | 189 +- third-party/zstd/tests/decodecorpus.c | 4 +- third-party/zstd/tests/fullbench.c | 28 +- third-party/zstd/tests/fuzz/Makefile | 65 +- third-party/zstd/tests/fuzz/README.md | 42 + .../zstd/tests/fuzz/decompress_cross_format.c | 130 + .../zstd/tests/fuzz/dictionary_round_trip.c | 12 +- third-party/zstd/tests/fuzz/fuzz.py | 18 +- .../zstd/tests/fuzz/fuzz_data_producer.c | 10 +- .../tests/fuzz/fuzz_third_party_seq_prod.h | 4 +- .../zstd/tests/fuzz/generate_sequences.c | 88 + .../zstd/tests/fuzz/regression_driver.c | 3 +- .../tests/fuzz/sequence_compression_api.c | 2 +- .../zstd/tests/fuzz/simple_decompress.c | 20 +- .../zstd/tests/fuzz/simple_round_trip.c | 16 +- .../zstd/tests/fuzz/stream_round_trip.c | 24 +- third-party/zstd/tests/fuzzer.c | 83 +- .../golden-decompression-errors/.gitignore | 1 + .../golden-decompression-errors/off0.bin.zst | Bin 0 -> 17 bytes .../zeroSeq_extraneous.zst | Bin 0 -> 27 bytes 
.../tests/golden-decompression/block-128k.zst | Bin 0 -> 131081 bytes .../golden-decompression/empty-block.zst | Bin 0 -> 11 bytes .../golden-decompression/rle-first-block.zst | Bin 0 -> 45 bytes .../tests/golden-decompression/zeroSeq_2B.zst | Bin 0 -> 25 bytes third-party/zstd/tests/loremOut.c | 50 + third-party/zstd/tests/loremOut.h | 15 + third-party/zstd/tests/playTests.sh | 216 +- third-party/zstd/tests/regression/results.csv | 264 +- third-party/zstd/tests/zstreamtest.c | 181 +- .../zstd/zlibWrapper/examples/example.c | 67 +- .../zlibWrapper/examples/example_original.c | 49 +- .../zstd/zlibWrapper/examples/minigzip.c | 67 +- third-party/zstd/zlibWrapper/gzclose.c | 4 +- third-party/zstd/zlibWrapper/gzlib.c | 93 +- third-party/zstd/zlibWrapper/gzread.c | 75 +- third-party/zstd/zlibWrapper/gzwrite.c | 76 +- 1052 files changed, 35106 insertions(+), 16933 deletions(-) create mode 100644 .github/workflows/build-all.yml create mode 100644 .github/workflows/build-x86.yml delete mode 100644 .github/workflows/linux-packages.yml create mode 100644 .github/workflows/update-manpage.yml delete mode 100644 common/cmdline.h delete mode 100644 common/integers.h delete mode 100644 common/main.cc delete mode 100644 common/output-file-unix.h delete mode 100644 common/output-file-win32.h delete mode 100644 common/output-file.h delete mode 100644 common/uuid.cc create mode 100644 docs/chart.svg delete mode 100644 docs/comparison.png delete mode 100644 elf/arch-alpha.cc delete mode 100644 elf/input-sections.cc delete mode 100644 elf/linker-script.cc delete mode 100644 elf/lto.cc rename {common => lib}/archive-file.h (92%) rename {common => lib}/common.h (66%) rename {common => lib}/compress.cc (96%) rename {common => lib}/config.h.in (75%) create mode 100644 lib/crc32.cc rename {common => lib}/demangle.cc (54%) rename {common => lib}/filepath.cc (51%) rename {test => lib}/gentoo-test.sh (86%) rename {common => lib}/glob.cc (92%) rename {common => lib}/hyperloglog.cc (100%) create 
mode 100644 lib/integers.h rename elf/jobs.cc => lib/jobs-unix.cc (62%) create mode 100644 lib/jobs-win32.cc create mode 100644 lib/malloc.cc create mode 100644 lib/mapped-file-unix.cc create mode 100644 lib/mapped-file-win32.cc rename {common => lib}/multi-glob.cc (72%) rename {common => lib}/perf.cc (92%) create mode 100644 lib/random.cc create mode 100644 lib/signal-unix.cc create mode 100644 lib/signal-win32.cc create mode 100644 lib/siphash.h rename {common => lib}/tar.cc (70%) rename {common => lib}/update-git-hash.cmake (100%) rename {elf => src}/arch-arm32.cc (88%) rename {elf => src}/arch-arm64.cc (95%) rename {elf => src}/arch-i386.cc (85%) rename {elf => src}/arch-loongarch.cc (53%) rename {elf => src}/arch-m68k.cc (95%) rename {elf => src}/arch-ppc32.cc (95%) rename {elf => src}/arch-ppc64v1.cc (95%) rename {elf => src}/arch-ppc64v2.cc (76%) rename {elf => src}/arch-riscv.cc (77%) rename {elf => src}/arch-s390x.cc (94%) rename {elf => src}/arch-sh4.cc (95%) rename {elf => src}/arch-sparc64.cc (84%) rename {elf => src}/arch-x86-64.cc (74%) rename {elf => src}/cmdline.cc (77%) create mode 100644 src/config.cc rename {elf => src}/elf.cc (96%) rename {elf => src}/elf.h (94%) rename {common => src}/filetype.h (61%) rename {elf => src}/gc-sections.cc (86%) rename {elf => src}/gdb-index.cc (98%) rename {elf => src}/icf.cc (91%) rename {elf => src}/input-files.cc (75%) create mode 100644 src/input-sections.cc create mode 100644 src/linker-script.cc rename {elf => src}/lto-unix.cc (78%) rename {elf => src}/lto-win32.cc (57%) rename {elf => src}/lto.h (97%) rename {elf => src}/main.cc (66%) rename {elf => src}/mapfile.cc (96%) rename {elf => src}/mold-wrapper.c (87%) rename {elf => src}/mold.h (82%) rename {elf => src}/output-chunks.cc (75%) create mode 100644 src/output-file-unix.cc create mode 100644 src/output-file-win32.cc rename {elf => src}/passes.cc (69%) rename {elf => src}/relocatable.cc (96%) create mode 100644 src/shrink-sections.cc rename 
elf/subprocess.cc => src/subprocess-unix.cc (88%) create mode 100644 src/subprocess-win32.cc rename {elf => src}/thunks.cc (83%) rename {elf => src}/tls.cc (88%) rename test/{elf => }/CMakeLists.txt (65%) rename test/{elf => }/abs-error.sh (94%) rename test/{elf => }/absolute-symbols.sh (93%) rename test/{elf => }/allow-multiple-definition.sh (100%) rename test/{elf => }/ar-alignment.sh (100%) rename test/{elf/aarch64_range-extension-thunk-disassembly.sh => arch-aarch64-range-extension-thunk-disassembly.sh} (92%) rename test/{elf/aarch64_variant-pcs.sh => arch-aarch64-variant-pcs.sh} (93%) create mode 100755 test/arch-arm-abs-error.sh rename test/{elf/arm_range-extension-thunk-disassembly.sh => arch-arm-range-extension-thunk-disassembly.sh} (94%) rename test/{elf/arm_range-extension-thunk.sh => arch-arm-range-extension-thunk.sh} (97%) rename test/{elf/arm_thumb-interwork.sh => arch-arm-thumb-interwork.sh} (93%) rename test/{elf/arm_tlsdesc.sh => arch-arm-tlsdesc.sh} (98%) rename test/{elf/i386_tls-module-base.sh => arch-i686-tls-module-base.sh} (97%) create mode 100755 test/arch-i686-tlsdesc.sh rename test/{elf/loongarch64_mcmodel-extreme.sh => arch-loongarch64-mcmodel-extreme.sh} (100%) create mode 100755 test/arch-loongarch64-relax-call36.sh create mode 100755 test/arch-loongarch64-relax-got-load.sh create mode 100755 test/arch-loongarch64-relax-pcala-addi.sh create mode 100755 test/arch-loongarch64-relax-tlsdesc.sh rename test/{elf/package-metadata.sh => arch-ppc64le-save-restore-gprs.sh} (53%) rename test/{elf/riscv64_attributes.sh => arch-riscv64-attributes.sh} (100%) rename test/{elf/riscv64_attributes2.sh => arch-riscv64-attributes2.sh} (100%) create mode 100755 test/arch-riscv64-global-pointer-dso.sh create mode 100755 test/arch-riscv64-global-pointer.sh rename test/{elf/riscv64_norvc.sh => arch-riscv64-norvc.sh} (91%) rename test/{elf/riscv64_obj-compatible.sh => arch-riscv64-obj-compatible.sh} (100%) create mode 100755 test/arch-riscv64-relax-got.sh 
rename test/{elf/riscv64_relax-hi20.sh => arch-riscv64-relax-hi20.sh} (92%) rename test/{elf/riscv64_weak-undef.sh => arch-riscv64-weak-undef.sh} (100%) rename test/{elf/s390x_got.sh => arch-s390x-got.sh} (73%) create mode 100755 test/arch-x86_64-address-equality.sh rename test/{elf/x86_64_empty-mergeable-section.sh => arch-x86_64-empty-mergeable-section.sh} (100%) rename test/{elf/x86_64_emulation-deduction.sh => arch-x86_64-emulation-deduction.sh} (73%) rename test/{elf/x86_64_exception-mcmodel-large.sh => arch-x86_64-exception-mcmodel-large.sh} (73%) rename test/{elf/x86_64_execstack-if-needed.sh => arch-x86_64-execstack-if-needed.sh} (92%) rename test/{elf/x86_64_gnu-linkonce.sh => arch-x86_64-gnu-linkonce.sh} (95%) rename test/{elf/x86_64_gnu-retain.sh => arch-x86_64-gnu-retain.sh} (94%) rename test/{elf/x86_64_gotpcrelx.sh => arch-x86_64-gotpcrelx.sh} (100%) rename test/{elf/x86_64_ifunc-alias.sh => arch-x86_64-ifunc-alias.sh} (100%) create mode 100755 test/arch-x86_64-incompatible-libs-linker-script.sh create mode 100755 test/arch-x86_64-incompatible-libs-linker-script2.sh rename test/{elf/x86_64_incompatible-libs.sh => arch-x86_64-incompatible-libs.sh} (96%) rename test/{elf/x86_64_incompatible-libs2.sh => arch-x86_64-incompatible-libs2.sh} (96%) rename test/{elf/x86_64_incompatible-obj.sh => arch-x86_64-incompatible-obj.sh} (91%) rename test/{elf/x86_64_init-array-readonly.sh => arch-x86_64-init-array-readonly.sh} (95%) rename test/{elf/x86_64_init-array.sh => arch-x86_64-init-array.sh} (95%) create mode 100755 test/arch-x86_64-isa-level.sh rename test/{elf/x86_64_large-bss.sh => arch-x86_64-large-bss.sh} (92%) rename test/{elf/x86_64_mergeable-records.sh => arch-x86_64-mergeable-records.sh} (93%) create mode 100755 test/arch-x86_64-mergeable-strings-nonalloc.sh rename test/{elf/x86_64_mergeable-strings.sh => arch-x86_64-mergeable-strings.sh} (89%) rename test/{elf/x86_64_note-property.sh => arch-x86_64-note-property.sh} (89%) rename 
test/{elf/x86_64_note-property2.sh => arch-x86_64-note-property2.sh} (97%) rename test/{elf/x86_64_note.sh => arch-x86_64-note.sh} (91%) rename test/{elf/x86_64_note2.sh => arch-x86_64-note2.sh} (89%) rename test/{elf/x86_64_plt.sh => arch-x86_64-plt.sh} (93%) rename test/{elf/x86_64_preinit-array.sh => arch-x86_64-preinit-array.sh} (95%) rename test/{elf/x86_64_relax.sh => arch-x86_64-relax.sh} (98%) rename test/{elf/x86_64_reloc-overflow.sh => arch-x86_64-reloc-overflow.sh} (88%) rename test/{elf/x86_64_reloc-zero.sh => arch-x86_64-reloc-zero.sh} (86%) rename test/{elf/x86_64_reloc.sh => arch-x86_64-reloc.sh} (99%) rename test/{elf/x86_64_section-alignment.sh => arch-x86_64-section-alignment.sh} (95%) rename test/{elf/x86_64_section-name.sh => arch-x86_64-section-name.sh} (98%) create mode 100755 test/arch-x86_64-tbss-only.sh rename test/{elf/x86_64_tls-gd-mcmodel-large.sh => arch-x86_64-tls-gd-mcmodel-large.sh} (97%) rename test/{elf/x86_64_tls-gd-to-ie.sh => arch-x86_64-tls-gd-to-ie.sh} (100%) rename test/{elf/x86_64_tls-large-tbss.sh => arch-x86_64-tls-large-tbss.sh} (89%) rename test/{elf/x86_64_tls-ld-mcmodel-large.sh => arch-x86_64-tls-ld-mcmodel-large.sh} (95%) rename test/{elf/x86_64_tls-module-base.sh => arch-x86_64-tls-module-base.sh} (96%) create mode 100755 test/arch-x86_64-tlsdesc.sh rename test/{elf/x86_64_unique.sh => arch-x86_64-unique.sh} (94%) rename test/{elf/x86_64_warn-execstack.sh => arch-x86_64-warn-execstack.sh} (60%) rename test/{elf/x86_64_warn-shared-textrel.sh => arch-x86_64-warn-shared-textrel.sh} (88%) rename test/{elf/x86_64_warn-textrel.sh => arch-x86_64-warn-textrel.sh} (87%) rename test/{elf/x86_64_z-ibt.sh => arch-x86_64-z-ibt.sh} (92%) rename test/{elf/x86_64_z-ibtplt.sh => arch-x86_64-z-ibtplt.sh} (93%) rename test/{elf/x86_64_endbr.sh => arch-x86_64-z-rewrite-endbr.sh} (95%) create mode 100755 test/arch-x86_64-z-rewrite-endbr2.sh create mode 100755 test/arch-x86_64-z-rewrite-endbr3.sh rename test/{elf/x86_64_z-shstk.sh => 
arch-x86_64-z-shstk.sh} (91%) rename test/{elf/x86_64_z-text.sh => arch-x86_64-z-text.sh} (89%) rename test/{elf => }/as-needed-dso.sh (93%) rename test/{elf => }/as-needed-dso2.sh (100%) rename test/{elf => }/as-needed-weak.sh (77%) rename test/{elf => }/as-needed.sh (60%) rename test/{elf => }/auxiliary.sh (100%) rename test/{elf => }/bno-symbolic.sh (97%) rename test/{elf => }/bsymbolic-functions.sh (100%) create mode 100755 test/bsymbolic-non-weak-functions.sh create mode 100755 test/bsymbolic-non-weak.sh rename test/{elf => }/bsymbolic.sh (100%) rename test/{elf => }/build-id.sh (87%) rename test/{elf => }/canonical-plt.sh (98%) rename test/{elf => }/cmdline.sh (100%) rename test/{elf => }/color-diagnostics.sh (100%) rename test/{elf => }/comment.sh (78%) rename test/{elf => }/common-archive.sh (100%) rename test/{elf => }/common-ref.sh (100%) rename test/{elf/common.sh => common-symbols.sh} (100%) rename test/{elf => }/common.inc (64%) rename test/{elf => }/compress-debug-sections-zstd.sh (100%) rename test/{elf => }/compress-debug-sections.sh (100%) rename test/{elf => }/compressed-debug-info.sh (100%) rename test/{elf => }/copyrel-alignment.sh (96%) create mode 100755 test/copyrel-norelro.sh rename test/{elf => }/copyrel-protected.sh (80%) rename test/{elf => }/copyrel-relro.sh (100%) rename test/{elf => }/copyrel-relro2.sh (100%) rename test/{elf => }/copyrel.sh (100%) rename test/{elf => }/ctors-in-init-array.sh (100%) rename test/{elf => }/dead-debug-sections.sh (100%) rename test/{elf => }/debug-macro-section.sh (100%) rename test/{elf => }/default-symver.sh (100%) rename test/{elf => }/defsym-lto.sh (79%) rename test/{elf => }/defsym-missing-symbol.sh (100%) rename test/{elf => }/defsym.sh (100%) rename test/{elf => }/defsym2.sh (100%) create mode 100755 test/demangle-cpp.sh rename test/{elf => }/demangle-rust.sh (100%) rename test/{elf => }/demangle.sh (100%) create mode 100755 test/dependency-file-response-file.sh rename test/{elf => 
}/dependency-file.sh (100%) rename test/{elf => }/disable-new-dtags.sh (100%) rename test/{elf => }/discard.sh (91%) rename test/{elf => }/dso-undef.sh (100%) rename test/{elf => }/dt-init.sh (100%) rename test/{elf => }/dt-needed.sh (100%) rename test/{elf => }/duplicate-error-archive.sh (100%) rename test/{elf => }/duplicate-error.sh (100%) rename test/{elf => }/dynamic-dt-debug.sh (100%) rename test/{elf => }/dynamic-linker.sh (100%) create mode 100755 test/dynamic-list-data.sh rename test/{elf => }/dynamic-list.sh (100%) rename test/{elf => }/dynamic-list2.sh (100%) rename test/{elf => }/dynamic-list3.sh (100%) rename test/{elf => }/dynamic-list4.sh (100%) rename test/{elf => }/dynamic.sh (84%) delete mode 100755 test/elf/mold-wrapper2.sh delete mode 100755 test/elf/now.sh delete mode 100755 test/elf/pack-dyn-relocs-relr.sh delete mode 100755 test/elf/relocatable-no-ehframe.sh delete mode 100755 test/elf/run.sh delete mode 100755 test/elf/shared-abs-sym.sh delete mode 100755 test/elf/z-pack-relative-relocs.sh delete mode 100755 test/elf/z-start-stop-visibility.sh rename test/{elf => }/emit-relocs-cpp.sh (100%) rename test/{elf => }/emit-relocs-dead-sections.sh (100%) rename test/{elf => }/emit-relocs.sh (100%) create mode 100755 test/empty-arg.sh rename test/{elf => }/empty-file.sh (100%) rename test/{elf => }/empty-input.sh (100%) rename test/{elf => }/empty-version.sh (100%) rename test/{elf => }/entry.sh (100%) create mode 100755 test/exception-multiple-ehframe.sh rename test/{elf => }/exception.sh (93%) rename test/{elf => }/exclude-libs.sh (84%) rename test/{elf => }/exclude-libs2.sh (100%) rename test/{elf => }/exclude-libs3.sh (100%) rename test/{elf => }/execstack.sh (100%) rename test/{elf => }/execute-only.sh (94%) rename test/{elf => }/export-dynamic.sh (100%) rename test/{elf => }/export-from-exe.sh (100%) rename test/{elf => }/fatal-warnings.sh (100%) rename test/{elf => }/filler.sh (100%) rename test/{elf => }/filter.sh (100%) rename test/{elf => 
}/func-addr.sh (100%) rename test/{elf => }/gc-sections.sh (100%) rename test/{elf => }/gdb-index-compress-output.sh (97%) rename test/{elf => }/gdb-index-dwarf2.sh (97%) rename test/{elf => }/gdb-index-dwarf3.sh (97%) rename test/{elf => }/gdb-index-dwarf4.sh (97%) rename test/{elf => }/gdb-index-dwarf5.sh (85%) rename test/{elf => }/gdb-index-dwarf64.sh (98%) rename test/{elf => }/gdb-index-empty.sh (100%) rename test/{elf => }/gdb-index-split-dwarf.sh (97%) rename test/{elf => }/glibc-2.22-bug.sh (94%) rename test/{elf => }/global-offset-table.sh (100%) rename test/{elf => }/gnu-hash.sh (100%) create mode 100755 test/gnu-property.sh create mode 100755 test/gnu-retain.sh rename test/{elf => }/gnu-unique.sh (100%) rename test/{elf => }/gnu-warning.sh (100%) rename test/{elf => }/hash-style.sh (100%) rename test/{elf => }/hello-dynamic.sh (100%) rename test/{elf => }/hello-static.sh (100%) rename test/{elf => }/help.sh (100%) create mode 100755 test/hidden-archive.sh rename test/{elf => }/hidden-undef.sh (100%) rename test/{elf => }/hidden-weak-undef.sh (100%) rename test/{elf => }/icf-safe.sh (100%) rename test/{elf => }/icf-small.sh (100%) rename test/{elf => }/icf.sh (100%) rename test/{elf => }/ifunc-address-equality-exported.sh (100%) rename test/{elf => }/ifunc-address-equality.sh (100%) rename test/{elf => }/ifunc-alias.sh (100%) rename test/{elf => }/ifunc-dlopen.sh (100%) rename test/{elf => }/ifunc-dso.sh (100%) rename test/{elf => }/ifunc-dynamic.sh (100%) rename test/{elf => }/ifunc-export.sh (100%) rename test/{elf => }/ifunc-funcptr.sh (100%) rename test/{elf => }/ifunc-noplt.sh (100%) rename test/{elf => }/ifunc-static-pie.sh (100%) rename test/{elf => }/ifunc-static.sh (100%) rename test/{elf => }/image-base.sh (100%) rename test/{elf => }/init-array-priorities.sh (100%) rename test/{elf => }/init-in-dso.sh (100%) rename test/{elf => }/init.sh (100%) rename test/{elf => }/initfirst.sh (100%) rename test/{elf => }/interpose.sh (100%) rename test/{elf 
=> }/invalid-version-script.sh (100%) rename test/{elf => }/issue646.sh (89%) rename test/{elf => }/large-alignment-dso.sh (100%) rename test/{elf => }/large-alignment.sh (100%) rename test/{elf => }/large-max-page-size-strip.sh (100%) rename test/{elf => }/large-max-page-size.sh (100%) rename test/{elf => }/large-text.sh (100%) create mode 100755 test/library.sh rename test/{elf => }/link-order.sh (100%) rename test/{elf => }/linker-script-defsym.sh (100%) create mode 100755 test/linker-script-error.sh rename test/{elf => }/linker-script-relocatable.sh (100%) rename test/{elf => }/linker-script.sh (100%) rename test/{elf => }/linker-script2.sh (100%) rename test/{elf => }/linker-script3.sh (100%) rename test/{elf => }/linker-script4.sh (100%) create mode 100755 test/linker-script5.sh create mode 100755 test/linker-script6.sh rename test/{elf => }/lto-archive.sh (87%) create mode 100755 test/lto-archive2.sh rename test/{elf => }/lto-dso.sh (80%) rename test/{elf => }/lto-gcc.sh (91%) rename test/{elf => }/lto-llvm.sh (78%) rename test/{elf => }/lto-nostdlib.sh (87%) rename test/{elf => }/lto-version-script.sh (95%) rename test/{elf => }/main-in-dso.sh (100%) rename test/{elf => }/many-sections.sh (100%) rename test/{elf => }/many-sections2.sh (82%) rename test/{elf => }/mergeable-strings.sh (100%) rename test/{elf => }/missing-but-ok.sh (100%) rename test/{elf => }/missing-error.sh (100%) rename test/{elf => }/mold-wrapper.sh (98%) create mode 100755 test/mold-wrapper2.sh create mode 100755 test/nmagic.sh create mode 100755 test/no-allow-shlib-undefined.sh rename test/{elf => }/no-eh-frame-header.sh (100%) rename test/{elf/bug178.sh => no-object-file.sh} (100%) rename test/{elf => }/no-quick-exit.sh (100%) rename test/{elf => }/no-undefined-version.sh (100%) rename test/{elf => }/nocopyreloc.sh (95%) rename test/{elf => }/noinhibit-exec.sh (100%) rename test/{elf => }/non-canonical-plt.sh (100%) rename test/{elf => }/nostdlib.sh (100%) rename test/{elf => 
}/oformat-binary.sh (100%) rename test/{elf => }/omagic.sh (100%) create mode 100755 test/package-metadata.sh rename test/{elf => }/physical-image-base.sh (100%) rename test/{elf => }/pie.sh (100%) rename test/{elf => }/plt-dso.sh (100%) rename test/{elf => }/pltgot.sh (100%) rename test/{elf => }/preinit-array.sh (100%) rename test/{elf => }/print-dependencies.sh (100%) rename test/{elf => }/protected-dynsym.sh (100%) rename test/{elf => }/protected.sh (100%) rename test/{elf => }/push-pop-state.sh (100%) rename test/{elf => }/range-extension-thunk.sh (83%) rename test/{elf => }/range-extension-thunk2.sh (100%) create mode 100755 test/range-extension-thunk3.sh rename test/{elf => }/relax-got-load.sh (100%) rename test/{elf => }/reloc-rodata.sh (100%) rename test/{elf => }/relocatable-archive.sh (100%) rename test/{elf => }/relocatable-c++.sh (88%) create mode 100755 test/relocatable-compressed-debug-info.sh rename test/{elf => }/relocatable-debug-info.sh (74%) rename test/{elf => }/relocatable-exception.sh (100%) rename test/{elf => }/relocatable-many-sections.sh (100%) rename test/{elf => }/relocatable-merge-sections.sh (100%) rename test/{elf => }/relocatable-mergeable-sections.sh (100%) rename test/{elf => }/relocatable.sh (100%) rename test/{elf => }/relro.sh (100%) rename test/{elf => }/repro.sh (71%) rename test/{elf => }/require-defined.sh (100%) rename test/{elf => }/response-file.sh (100%) rename test/{elf => }/response-file2.sh (100%) rename test/{elf => }/retain-symbols-file.sh (69%) rename test/{elf => }/reverse-sections.sh (100%) rename test/{elf => }/rodata-name.sh (100%) rename test/{elf => }/rosegment.sh (100%) rename test/{elf => }/rpath.sh (100%) rename test/{elf => }/run-clang.sh (94%) create mode 100755 test/run.sh rename test/{elf => }/section-align.sh (100%) create mode 100755 test/section-attributes.sh rename test/{elf => }/section-order.sh (96%) rename test/{elf => }/section-start.sh (100%) create mode 100755 test/separate-debug-file.sh 
create mode 100755 test/shared-abs-sym.sh rename test/{elf => }/shared.sh (100%) rename test/{elf => }/shuffle-sections-seed.sh (100%) rename test/{elf => }/shuffle-sections.sh (100%) rename test/{elf => }/soname.sh (100%) create mode 100755 test/spare-program-headers.sh rename test/{elf => }/start-lib.sh (100%) rename test/{elf => }/start-stop-symbol.sh (100%) rename test/{elf => }/start-stop.sh (100%) rename test/{elf => }/static-archive.sh (100%) rename test/{elf => }/static-pie.sh (100%) rename test/{elf => }/stdout.sh (100%) rename test/{elf => }/strip-debug.sh (100%) rename test/{elf => }/strip.sh (79%) create mode 100755 test/stt-common.sh rename test/{elf => }/symbol-rank.sh (100%) rename test/{elf => }/symbol-version-lto.sh (92%) rename test/{elf => }/symbol-version.sh (100%) rename test/{elf => }/symbol-version2.sh (100%) rename test/{elf => }/symbol-version3.sh (100%) create mode 100755 test/symbol-version4.sh rename test/{elf => }/symtab-dso.sh (100%) rename test/{elf => }/symtab-section-symbols.sh (100%) rename test/{elf => }/symtab.sh (100%) rename test/{elf => }/synthetic-symbols.sh (100%) rename test/{elf => }/sysroot-linker-script.sh (100%) rename test/{elf => }/sysroot.sh (100%) rename test/{elf => }/sysroot2.sh (100%) rename test/{elf => }/tail-call.sh (100%) create mode 100755 test/tbss-only.sh rename test/{elf => }/thin-archive.sh (93%) rename test/{elf => }/thread-count.sh (100%) rename test/{elf => }/tls-alignment-multi.sh (100%) rename test/{elf => }/tls-common.sh (88%) rename test/{elf => }/tls-df-static-tls.sh (100%) rename test/{elf => }/tls-dso.sh (100%) rename test/{elf => }/tls-gd-dlopen.sh (100%) rename test/{elf => }/tls-gd-noplt.sh (99%) rename test/{elf => }/tls-gd-to-ie.sh (100%) rename test/{elf => }/tls-gd.sh (99%) rename test/{elf => }/tls-ie.sh (100%) rename test/{elf => }/tls-irregular-start-addr.sh (100%) rename test/{elf => }/tls-large-alignment.sh (100%) rename test/{elf => }/tls-large-static-image.sh (100%) rename 
test/{elf => }/tls-ld-noplt.sh (100%) rename test/{elf => }/tls-ld.sh (100%) rename test/{elf => }/tls-le-error.sh (100%) rename test/{elf => }/tls-le.sh (78%) rename test/{elf => }/tls-nopic.sh (100%) rename test/{elf => }/tls-pic.sh (100%) rename test/{elf => }/tls-small-alignment.sh (100%) rename test/{elf => }/tlsdesc-dlopen.sh (100%) rename test/{elf => }/tlsdesc-import.sh (100%) rename test/{elf => }/tlsdesc-initial-exec.sh (72%) rename test/{elf => }/tlsdesc-local-dynamic.sh (100%) rename test/{elf => }/tlsdesc-static.sh (100%) rename test/{elf => }/tlsdesc.sh (100%) create mode 100755 test/trace-symbol-symver.sh rename test/{elf => }/trace-symbol.sh (100%) rename test/{elf => }/trace.sh (100%) create mode 100755 test/undefined-glob-gc-sections.sh create mode 100755 test/undefined-glob.sh rename test/{elf => }/undefined.sh (100%) rename test/{elf => }/undefined2.sh (100%) create mode 100755 test/unkown-section-type.sh rename test/{elf => }/unresolved-symbols.sh (100%) create mode 100755 test/unresolved-symbols2.sh rename test/{elf => }/verbose.sh (100%) rename test/{elf => }/version-script-search-paths.sh (100%) rename test/{elf => }/version-script.sh (100%) rename test/{elf => }/version-script10.sh (100%) rename test/{elf => }/version-script11.sh (100%) rename test/{elf => }/version-script12.sh (100%) rename test/{elf => }/version-script13.sh (100%) rename test/{elf => }/version-script14.sh (100%) rename test/{elf => }/version-script15.sh (100%) rename test/{elf => }/version-script16.sh (100%) rename test/{elf => }/version-script17.sh (100%) rename test/{elf => }/version-script18.sh (100%) rename test/{elf => }/version-script19.sh (100%) rename test/{elf => }/version-script2.sh (100%) create mode 100755 test/version-script20.sh create mode 100755 test/version-script21.sh create mode 100755 test/version-script22.sh create mode 100755 test/version-script23.sh rename test/{elf => }/version-script3.sh (100%) rename test/{elf => }/version-script4.sh (100%) 
rename test/{elf => }/version-script5.sh (100%) rename test/{elf => }/version-script6.sh (100%) rename test/{elf => }/version-script7.sh (100%) rename test/{elf => }/version-script8.sh (100%) rename test/{elf => }/version-script9.sh (100%) rename test/{elf => }/version.sh (58%) rename test/{elf => }/versioned-undef.sh (100%) rename test/{elf => }/visibility.sh (100%) rename test/{elf => }/warn-common.sh (100%) rename test/{elf => }/warn-once.sh (83%) rename test/{elf => }/warn-symbol-type.sh (100%) rename test/{elf => }/warn-unresolved-symbols.sh (100%) rename test/{elf => }/weak-export-dso.sh (100%) create mode 100755 test/weak-export-dso2.sh rename test/{elf => }/weak-export-exe.sh (100%) rename test/{elf => }/weak-undef-dso.sh (100%) rename test/{elf => }/weak-undef.sh (100%) rename test/{elf => }/weak-undef2.sh (100%) rename test/{elf => }/weak-undef4.sh (100%) create mode 100755 test/weak-undef5.sh rename test/{elf => }/whole-archive.sh (61%) rename test/{elf => }/wrap-lto.sh (96%) rename test/{elf => }/wrap.sh (100%) rename test/{elf => }/z-cet-report.sh (100%) rename test/{elf => }/z-defs.sh (100%) rename test/{elf => }/z-dynamic-undefined-weak.sh (100%) rename test/{elf => }/z-max-page-size.sh (100%) rename test/{elf => }/z-nodefaultlib.sh (100%) rename test/{elf => }/z-nodump.sh (100%) rename test/{elf => }/z-now.sh (100%) rename test/{elf => }/z-origin.sh (100%) create mode 100755 test/z-pack-relative-relocs.sh create mode 100755 test/z-rodynamic.sh rename test/{elf => }/z-sectionheader.sh (100%) rename test/{elf => }/z-separate-code.sh (100%) rename test/{elf => }/z-stack-size.sh (100%) create mode 100755 test/z-start-stop-visibility.sh rename test/{elf => }/z-unknown.sh (100%) create mode 100644 third-party/blake3/.git-blame-ignore-revs create mode 100644 third-party/blake3/rust/guts/Cargo.toml create mode 100644 third-party/blake3/rust/guts/readme.md create mode 100644 third-party/blake3/rust/guts/src/lib.rs create mode 100644 
third-party/blake3/rust/guts/src/portable.rs create mode 100644 third-party/blake3/rust/guts/src/test.rs create mode 100644 third-party/blake3/src/io.rs create mode 100644 third-party/mimalloc/SECURITY.md create mode 100644 third-party/mimalloc/bin/readme.md create mode 100644 third-party/mimalloc/docker/alpine-arm32v7/Dockerfile create mode 100644 third-party/mimalloc/docker/alpine/Dockerfile create mode 100644 third-party/mimalloc/docker/manylinux-x64/Dockerfile create mode 100644 third-party/mimalloc/docker/readme.md create mode 100644 third-party/mimalloc/src/free.c create mode 100644 third-party/mimalloc/src/libc.c create mode 100644 third-party/mimalloc/src/prim/emscripten/prim.c create mode 100644 third-party/tbb/.github/CODEOWNERS create mode 100644 third-party/tbb/.github/issue_labeler.yml create mode 100644 third-party/tbb/.github/workflows/issue_labeler.yml create mode 100644 third-party/tbb/MODULE.bazel create mode 100644 third-party/tbb/SUPPORT.md create mode 100644 third-party/tbb/cmake/resumable_tasks.cmake create mode 100644 third-party/tbb/doc/main/intro/limitations.rst create mode 100644 third-party/tbb/doc/main/reference/rvalue_reduce.rst rename third-party/tbb/doc/main/tbb_userguide/{Package_Contents.rst => Package_Contents_os.rst} (93%) create mode 100644 third-party/tbb/examples/migration/README.md create mode 100644 third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt create mode 100644 third-party/tbb/examples/migration/recursive_fibonacci/README.md create mode 100644 third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp create mode 100644 third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h create mode 100644 third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h create mode 100644 third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h create mode 100644 third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt create mode 100644 
third-party/tbb/examples/parallel_reduce/pi/README.md create mode 100644 third-party/tbb/examples/parallel_reduce/pi/common.h create mode 100644 third-party/tbb/examples/parallel_reduce/pi/main.cpp create mode 100644 third-party/tbb/examples/parallel_reduce/pi/pi.cpp create mode 100644 third-party/tbb/include/tbb/mutex.h create mode 100644 third-party/tbb/include/tbb/rw_mutex.h create mode 100644 third-party/tbb/integration/linux/oneapi/vars.sh create mode 100644 third-party/tbb/integration/windows/oneapi/vars.bat create mode 100644 third-party/tbb/src/tbb/cancellation_disseminator.h create mode 100644 third-party/tbb/src/tbb/permit_manager.h create mode 100644 third-party/tbb/src/tbb/pm_client.h create mode 100644 third-party/tbb/src/tbb/tcm.h create mode 100644 third-party/tbb/src/tbb/tcm_adaptor.cpp create mode 100644 third-party/tbb/src/tbb/tcm_adaptor.h create mode 100644 third-party/tbb/src/tbb/thread_control_monitor.h create mode 100644 third-party/tbb/src/tbb/thread_dispatcher.cpp create mode 100644 third-party/tbb/src/tbb/thread_dispatcher.h create mode 100644 third-party/tbb/src/tbb/thread_dispatcher_client.h create mode 100644 third-party/tbb/src/tbb/thread_request_serializer.cpp create mode 100644 third-party/tbb/src/tbb/thread_request_serializer.h create mode 100644 third-party/tbb/src/tbb/threading_control.cpp create mode 100644 third-party/tbb/src/tbb/threading_control.h create mode 100644 third-party/tbb/src/tbb/threading_control_client.h create mode 100755 third-party/tbb/src/tbbbind/def/mac64-tbbbind.def create mode 100644 third-party/tbb/test/tbb/test_fuzzing.cpp delete mode 100644 third-party/zlib/.github/workflows/cmake.yml delete mode 100644 third-party/zlib/.github/workflows/configure.yml delete mode 100644 third-party/zlib/.github/workflows/fuzz.yml delete mode 100644 third-party/zlib/.gitignore create mode 100644 third-party/zlib/contrib/nuget/nuget.csproj create mode 100644 third-party/zlib/contrib/nuget/nuget.sln create mode 100644 
third-party/zlib/contrib/vstudio/vc17/miniunz.vcxproj create mode 100644 third-party/zlib/contrib/vstudio/vc17/minizip.vcxproj create mode 100644 third-party/zlib/contrib/vstudio/vc17/testzlib.vcxproj create mode 100644 third-party/zlib/contrib/vstudio/vc17/testzlibdll.vcxproj create mode 100644 third-party/zlib/contrib/vstudio/vc17/zlib.rc create mode 100644 third-party/zlib/contrib/vstudio/vc17/zlibstat.vcxproj create mode 100644 third-party/zlib/contrib/vstudio/vc17/zlibvc.def create mode 100644 third-party/zlib/contrib/vstudio/vc17/zlibvc.sln create mode 100644 third-party/zlib/contrib/vstudio/vc17/zlibvc.vcxproj create mode 100644 third-party/zstd/.github/workflows/commit.yml create mode 100644 third-party/zstd/.github/workflows/nightly.yml delete mode 100644 third-party/zstd/.travis.yml create mode 100644 third-party/zstd/SECURITY.md delete mode 100644 third-party/zstd/appveyor.yml delete mode 100644 third-party/zstd/build/cmake/zstdConfig.cmake create mode 100644 third-party/zstd/build/cmake/zstdConfig.cmake.in create mode 100644 third-party/zstd/doc/decompressor_permissive.md create mode 100644 third-party/zstd/programs/lorem.c create mode 100644 third-party/zstd/programs/lorem.h create mode 100755 third-party/zstd/tests/cli-tests/basic/args.sh create mode 100644 third-party/zstd/tests/cli-tests/basic/args.sh.exit create mode 100644 third-party/zstd/tests/cli-tests/basic/args.sh.stderr.glob create mode 100755 third-party/zstd/tests/cli-tests/decompression/detectErrors.sh create mode 100755 third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh create mode 100644 third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh.stderr.exact create mode 100644 third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh.stdout.exact create mode 100644 third-party/zstd/tests/fuzz/decompress_cross_format.c create mode 100644 third-party/zstd/tests/fuzz/generate_sequences.c create mode 100644 
third-party/zstd/tests/golden-decompression-errors/.gitignore create mode 100644 third-party/zstd/tests/golden-decompression-errors/off0.bin.zst create mode 100644 third-party/zstd/tests/golden-decompression-errors/zeroSeq_extraneous.zst create mode 100644 third-party/zstd/tests/golden-decompression/block-128k.zst create mode 100644 third-party/zstd/tests/golden-decompression/empty-block.zst create mode 100644 third-party/zstd/tests/golden-decompression/rle-first-block.zst create mode 100644 third-party/zstd/tests/golden-decompression/zeroSeq_2B.zst create mode 100644 third-party/zstd/tests/loremOut.c create mode 100644 third-party/zstd/tests/loremOut.h diff --git a/.github/workflows/build-all.yml b/.github/workflows/build-all.yml new file mode 100644 index 00000000..62617826 --- /dev/null +++ b/.github/workflows/build-all.yml @@ -0,0 +1,46 @@ +name: Build all tarballs + +on: + schedule: + - cron: '0 0 * * *' + release: + types: [ published ] + workflow_dispatch: + +jobs: + build-tarballs: + runs-on: ubuntu-latest + + strategy: + matrix: + target: [ x86_64, aarch64, arm, riscv64, ppc64le, s390x ] + + permissions: + contents: read + packages: write + + steps: + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build a tarball + run: ./dist.sh ${{ matrix.target }} + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.target }} + path: mold-*.tar.gz diff --git a/.github/workflows/build-x86.yml b/.github/workflows/build-x86.yml new file mode 100644 index 00000000..828a9f14 --- /dev/null +++ b/.github/workflows/build-x86.yml @@ -0,0 +1,37 @@ +name: Build x86 tarball + +on: + push: + branches: [ main ] + 
workflow_dispatch: + +jobs: + build-tarball: + runs-on: ubuntu-latest + + permissions: + contents: read + packages: write + + steps: + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build a tarball + run: ./dist.sh + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: tarball + path: mold-*.tar.gz diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3b05257f..16e27fac 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,133 +1,131 @@ name: CI on: push: - branches: [ main ] pull_request: - branches: [ main ] env: UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1 jobs: - build-clang: + build-sanitizers: strategy: matrix: target: - # Disable PCH for the default configuration. This prevents relying on implicit includes. - - '-DCMAKE_DISABLE_PRECOMPILE_HEADERS=On' + - '' - '-DMOLD_USE_ASAN=On' - '-DMOLD_USE_TSAN=On' - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 - uses: rui314/setup-mold@staging - - name: install-build-deps - run: sudo ./install-build-deps.sh update - - name: ccache - uses: hendrikmuhs/ccache-action@v1 - - name: build and test + - run: sudo ./install-build-deps.sh + - name: build run: | - echo "/usr/lib/ccache:/usr/local/opt/ccache/libexec" >> $GITHUB_PATH - sudo apt-get install -y clang++-12 + sudo apt-get install -y clang-18 clang gcc-multilib gdb dwarfdump zstd mkdir build cd build - cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++-12 ${{ matrix.target }} .. + cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 ${{ matrix.target }} .. cmake --build . -j$(nproc) - ctest . 
-j$(nproc) + - run: ctest --test-dir build -j$(nproc) - name: archive test results uses: actions/upload-artifact@v3 if: failure() with: name: test-results-clang path: | - build/out - build/Testing + build + !build/CMakeFiles - build-gcc: - runs-on: ubuntu-20.04 + build-multi-archs: + runs-on: ubuntu-latest container: gcc:11.1.0 steps: - uses: actions/checkout@v3 - name: install-build-deps + shell: bash run: | # Install cross toolchains - dpkg --add-architecture i386 - ./install-build-deps.sh update - apt-get install -y sudo qemu-user gdb zstd dwarfdump xz-utils {gcc,g++}-10-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha}-linux-gnu {gcc,g++}-10-arm-linux-gnueabihf - - for i in {i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha}-linux-gnu arm-linux-gnueabihf; do - ln -sf /usr/bin/$i-gcc-10 /usr/bin/$i-gcc - ln -sf /usr/bin/$i-g++-10 /usr/bin/$i-g++ - done + ./install-build-deps.sh + ./install-cross-tools.sh # Install a RV32 toolchain from third party since it's not available # as an Ubuntu package. 
- mkdir /usr/local/rv32 - wget -O- -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.07.07/riscv32-glibc-ubuntu-20.04-gcc-nightly-2023.07.07-nightly.tar.gz | tar -C /usr/local/rv32 --strip-components=1 -xzf - + mkdir /rv32 + wget -O- -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.07.07/riscv32-glibc-ubuntu-20.04-gcc-nightly-2023.07.07-nightly.tar.gz | tar -C /rv32 --strip-components=1 -xzf - - ln -sf /usr/local/rv32/sysroot /usr/riscv32-linux-gnu - echo '/usr/local/rv32/bin/riscv32-unknown-linux-gnu-gcc -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-gcc - echo '/usr/local/rv32/bin/riscv32-unknown-linux-gnu-g++ -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-g++ + ln -sf /rv32/sysroot /usr/riscv32-linux-gnu + echo '/rv32/bin/riscv32-unknown-linux-gnu-gcc -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-gcc + echo '/rv32/bin/riscv32-unknown-linux-gnu-g++ -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-g++ chmod 755 /usr/bin/riscv32-linux-gnu-{gcc,g++} for i in objdump objcopy strip; do - ln -sf /usr/local/rv32/bin/riscv32-unknown-linux-gnu-$i /usr/bin/riscv32-linux-gnu-$i + ln -sf /rv32/bin/riscv32-unknown-linux-gnu-$i /usr/bin/riscv32-linux-gnu-$i done # Install a LoongArch toolchain - mkdir /usr/local/larch - wget -O- -q https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz | tar -C /usr/local/larch --strip-components=1 --xz -xf - + mkdir /larch + wget -O- -q https://github.com/loongson/build-tools/releases/download/2024.08.08/x86_64-cross-tools-loongarch64-binutils_2.43-gcc_14.2.0-glibc_2.40.tar.xz | tar -C /larch --strip-components=1 --xz -xf - - ln -sf /usr/local/larch/target /usr/loongarch64-linux-gnu - cp -r /usr/local/larch/loongarch64-unknown-linux-gnu/lib/* /usr/loongarch64-linux-gnu/lib64/ + cp -r /larch/loongarch64-unknown-linux-gnu/lib/* /larch/target/lib64 + ln -sf 
/larch/target /usr/loongarch64-linux-gnu - for i in objdump objcopy strip; do - ln -sf /usr/local/larch/bin/loongarch64-unknown-linux-gnu-$i /usr/bin/loongarch64-linux-gnu-$i + for i in gcc g++ objdump objcopy strip; do + ln -sf /larch/bin/loongarch64-unknown-linux-gnu-$i /usr/bin/loongarch64-linux-gnu-$i done - echo '/usr/local/larch/bin/loongarch64-unknown-linux-gnu-gcc -L/usr/local/larch/loongarch64-unknown-linux-gnu "$@"' > /usr/bin/loongarch64-linux-gnu-gcc - echo '/usr/local/larch/bin/loongarch64-unknown-linux-gnu-g++ -L/usr/local/larch/loongarch64-unknown-linux-gnu "$@"' > /usr/bin/loongarch64-linux-gnu-g++ - chmod 755 /usr/bin/loongarch64-linux-gnu-{gcc,g++} - - wget -O/usr/local/bin/qemu-loongarch64 -q https://github.com/loongson/build-tools/releases/download/2023.08.08/qemu-loongarch64 + wget -O /usr/local/bin/qemu-loongarch64 -q https://github.com/loongson/build-tools/releases/download/2023.08.08/qemu-loongarch64 chmod 755 /usr/local/bin/qemu-loongarch64 - shell: bash - - name: ccache - uses: hendrikmuhs/ccache-action@v1 - - name: build and test + # Install Intel SDE CPU emulator for CET-related tests + mkdir /sde + wget -O- -q https://downloadmirror.intel.com/813591/sde-external-9.33.0-2024-01-07-lin.tar.xz | tar -C /sde --strip-components=1 --xz -xf - + ln -s /sde/sde /usr/bin + - name: build run: | - echo "/usr/lib/ccache:/usr/local/opt/ccache/libexec" >> $GITHUB_PATH mkdir build cd build cmake .. cmake --build . -j$(nproc) - ctest . -j$(nproc) + - run: ctest --test-dir build -j$(nproc) - name: archive test results uses: actions/upload-artifact@v3 if: failure() with: name: test-results-gcc path: | - build/out - build/Testing + build + !build/CMakeFiles - build-macos: - runs-on: macos-11 + build-distros: strategy: matrix: - target: - # Disable PCH for the default configuration. This prevents relying on implicit includes. 
- - '-DCMAKE_DISABLE_PRECOMPILE_HEADERS=On' - - '-DMOLD_USE_ASAN=On' + distro: + - alpine + - archlinux + - fedora + - gentoo/stage3 + - opensuse/tumbleweed + - ubuntu:22.04 + runs-on: ubuntu-latest + container: ${{ matrix.distro }} + steps: + - uses: actions/checkout@v2 + - run: ./install-build-deps.sh + - name: build + run: | + mkdir build + cd build + cmake .. + cmake --build . -j$(nproc) + - run: ctest --test-dir build -j$(nproc) + + build-macos: + runs-on: macos-12 steps: - uses: actions/checkout@v3 - - name: ccache - uses: hendrikmuhs/ccache-action@v1 - - name: build and test + - name: build run: | - echo "/usr/lib/ccache:/usr/local/opt/ccache/libexec" >> $GITHUB_PATH mkdir build cd build - cmake ${{ matrix.target }} .. + cmake .. cmake --build . -j$(sysctl -n hw.physicalcpu) build-windows: @@ -140,3 +138,37 @@ jobs: cd build cmake -T clangcl .. cmake --build . -j $Env:NUMBER_OF_PROCESSORS + + build-msys: + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - name: Setup MSYS2 + uses: msys2/setup-msys2@v2 + with: + msystem: UCRT64 + update: true + pacboy: gcc-libs:p libwinpthread-git:p tbb:p zlib:p zstd:p dlfcn:p cc:p cmake:p ninja:p + - name: build + shell: msys2 {0} + run: | + mkdir build + cd build + cmake -GNinja -DMOLD_USE_MIMALLOC=OFF -DMOLD_USE_SYSTEM_TBB=ON .. + cmake --build . -j $(nproc) + + build-freebsd: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Build and test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + run: | + ./install-build-deps.sh + mkdir build + cd build + cmake .. + cmake --build . 
-j$(nproc) + ctest -j$(nproc) diff --git a/.github/workflows/linux-packages.yml b/.github/workflows/linux-packages.yml deleted file mode 100644 index ee4fc916..00000000 --- a/.github/workflows/linux-packages.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: linux-packages -on: - push: - branches: [ main ] -jobs: - build: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: build - run: ./dist.sh x86_64 - - name: upload - uses: actions/upload-artifact@v3 - with: - path: mold-*-linux.tar.gz diff --git a/.github/workflows/update-manpage.yml b/.github/workflows/update-manpage.yml new file mode 100644 index 00000000..1107e774 --- /dev/null +++ b/.github/workflows/update-manpage.yml @@ -0,0 +1,33 @@ +name: Update manpage + +on: + push: + paths: + - 'docs/mold.md' + branches: + - main + workflow_dispatch: + +jobs: + update-manpage: + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v2 + + - name: Install ronn + run: sudo apt-get update && sudo apt-get install -y ronn + + - name: Generate mold.1 from mold.md + run: ronn --roff docs/mold.md + + - name: Configure Git + run: | + git config --global user.name 'Rui Ueyama' + git config --global user.email 'rui314@gmail.com' + + - name: Commit and push if mold.1 is updated + run: | + git add docs/mold.1 + git diff --staged --quiet || (git commit -m "Update mold.1 (automated commit)" && git push) diff --git a/.gitignore b/.gitignore index fe8fea00..f5bfbc40 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ core gentoo /mold-*.tar.gz /build* +/mold diff --git a/CMakeLists.txt b/CMakeLists.txt index 711dbd89..d6d1500a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,31 +1,31 @@ # You can customize a build by specifying CMake options. An option may be # given in the -Dvariable=value form. For a boolean variable, `ON` or `1` -# means true while `OFF` or `0` means false. +# means true, while `OFF` or `0` means false. 
# # Here are a couple of common cmake options: # # -DCMAKE_C_COMPILER= # -# Specifies a C compiler name to use. The default value is `cc`. +# Specifies the C compiler name to use. The default value is `cc`. # # -DCMAKE_CXX_COMPILER= # -# Specifies a C++ compiler name to use. The default value is `c++`. +# Specifies the C++ compiler name to use. The default value is `c++`. # # -DCMAKE_INSTALL_PREFIX= # -# Specifies an install target directory. The default value is `/usr/local`. +# Specifies the install target directory. The default value is `/usr/local`. # # -DCMAKE_BUILD_TYPE=[Debug | Release | RelWithDebInfo | MinSizeRel] # -# Specifies a build type. The default is `Release` which is the right +# Specifies the build type. The default is `Release`, which is the right # option unless you are debugging mold. # # An example of a cmake command line is shown below: # # $ cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_CXX_COMPILER=g++-12 .. # -# where `..` refers this directory. +# where `..` refers to this directory. # # With cmake, you may run `cmake --install .` instead of `make install` to # install build artifacts to system directories. If you want to install @@ -39,14 +39,15 @@ # how to build mold. However, as a policy, we do not provide a way to # enable/disable any individual mold's feature. In other words, we do not # provide options like `--enable-foo` or `--disable-foo`. The motivation -# behind it is build reproducibility. We want to guarantees that all builds -# of the mold linker of the same version will have the exactly same set of +# behind this is build reproducibility. We want to guarantee that all builds +# of the mold linker of the same version will have the exact same set of # features and behave exactly the same. 
cmake_minimum_required(VERSION 3.14) -project(mold VERSION 2.3.3) +project(mold VERSION 2.34.0) include(CMakeDependentOption) +include(CheckSymbolExists) include(GNUInstallDirs) # Build mold itself using mold if -DMOLD_USE_MOLD=ON @@ -57,7 +58,12 @@ endif() add_executable(mold) target_compile_features(mold PRIVATE cxx_std_20) -target_link_libraries(mold PRIVATE ${CMAKE_DL_LIBS}) + +if(MINGW) + target_link_libraries(mold PRIVATE dl) +else() + target_link_libraries(mold PRIVATE ${CMAKE_DL_LIBS}) +endif() if(NOT "${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC") target_compile_options(mold PRIVATE @@ -106,7 +112,7 @@ endif() # This option is intended to be used by `./dist.sh` script to create a # mold binary that works on various Linux distros. You probably don't # need nor want to set this to ON. -option(MOLD_MOSTLY_STATIC "Statically link libstdc++ and libcrypto" OFF) +option(MOLD_MOSTLY_STATIC "Statically link libstdc++ and some other libraries" OFF) if(MOLD_MOSTLY_STATIC) target_link_options(mold PRIVATE -static-libstdc++) endif() @@ -117,6 +123,7 @@ find_package(ZLIB QUIET) if(ZLIB_FOUND AND NOT MOLD_MOSTLY_STATIC) target_link_libraries(mold PRIVATE ZLIB::ZLIB) else() + set(ZLIB_BUILD_EXAMPLES OFF CACHE INTERNAL "") add_subdirectory(third-party/zlib EXCLUDE_FROM_ALL) target_include_directories(zlibstatic INTERFACE third-party/zlib $) @@ -129,12 +136,17 @@ find_package(BLAKE3 QUIET) if(BLAKE3_FOUND AND NOT MOLD_MOSTLY_STATIC) target_link_libraries(mold PRIVATE BLAKE3::blake3) else() - add_subdirectory(third-party/blake3/c EXCLUDE_FROM_ALL) - target_link_libraries(mold PRIVATE blake3) - target_include_directories(mold PUBLIC third-party/blake3/c) + function(mold_add_blake3) + set(BUILD_SHARED_LIBS OFF) + add_subdirectory(third-party/blake3/c EXCLUDE_FROM_ALL) + target_link_libraries(mold PRIVATE blake3) + target_include_directories(mold PUBLIC third-party/blake3/c) + endfunction() + + mold_add_blake3() endif() -# Find zstd compression library. 
If libzstd.so is not found, we compile a +# Find zstd compression library. If zstd.h is not found, we compile a # bundled one and statically-link it to mold. include(CheckIncludeFile) check_include_file(zstd.h HAVE_ZSTD_H) @@ -212,11 +224,6 @@ else() mold_add_tbb() endif() -# Check if this is a commercial version of mold (a.k.a. "sold") -if(EXISTS "${CMAKE_SOURCE_DIR}/LICENSE.md") - set(MOLD_IS_SOLD ON) -endif() - # We always use Clang to build mold on Windows. MSVC can't compile mold. if(WIN32) if(MSVC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") @@ -245,7 +252,7 @@ if(NOT APPLE AND NOT WIN32) # Remove the default `lib` prefix set_target_properties(mold-wrapper PROPERTIES PREFIX "") target_link_libraries(mold-wrapper PRIVATE ${CMAKE_DL_LIBS}) - target_sources(mold-wrapper PRIVATE elf/mold-wrapper.c) + target_sources(mold-wrapper PRIVATE src/mold-wrapper.c) endif() # If atomics doesn't work by default, add -latomic. @@ -270,20 +277,22 @@ if(NOT APPLE AND NOT MSVC) target_link_options(mold PRIVATE -pthread) endif() +check_symbol_exists(madvise sys/mman.h HAVE_MADVISE) + # Create a .cc file containing the current git hash for `mold --version`. add_custom_target(git_hash COMMAND ${CMAKE_COMMAND} -DSOURCE_DIR=${CMAKE_SOURCE_DIR} -DOUTPUT_FILE=${CMAKE_BINARY_DIR}/git-hash.cc - -P ${CMAKE_SOURCE_DIR}/common/update-git-hash.cmake - DEPENDS common/update-git-hash.cmake + -P ${CMAKE_SOURCE_DIR}/lib/update-git-hash.cmake + DEPENDS lib/update-git-hash.cmake BYPRODUCTS git-hash.cc VERBATIM) add_dependencies(mold git_hash) # Create config.h file -configure_file(common/config.h.in config.h) +configure_file(lib/config.h.in config.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}) # Almost all functions are template in mold which take a target type @@ -296,43 +305,45 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) # on a multicore machine. 
list(APPEND MOLD_ELF_TARGETS X86_64 I386 ARM64 ARM32 RV32LE RV32BE RV64LE RV64BE PPC32 PPC64V1 PPC64V2 - S390X SPARC64 M68K SH4 ALPHA LOONGARCH32 LOONGARCH64) + S390X SPARC64 M68K SH4 LOONGARCH32 LOONGARCH64) list(APPEND MOLD_ELF_TEMPLATE_FILES - elf/arch-loongarch.cc - elf/arch-riscv.cc - elf/cmdline.cc - elf/gc-sections.cc - elf/gdb-index.cc - elf/icf.cc - elf/input-files.cc - elf/input-sections.cc - elf/jobs.cc - elf/linker-script.cc - elf/lto.cc - elf/main.cc - elf/mapfile.cc - elf/output-chunks.cc - elf/passes.cc - elf/relocatable.cc - elf/subprocess.cc - elf/thunks.cc - elf/tls.cc + src/arch-loongarch.cc + src/arch-riscv.cc + src/cmdline.cc + src/gc-sections.cc + src/gdb-index.cc + src/icf.cc + src/input-files.cc + src/input-sections.cc + src/linker-script.cc + src/main.cc + src/mapfile.cc + src/output-chunks.cc + src/passes.cc + src/relocatable.cc + src/shrink-sections.cc + src/thunks.cc + src/tls.cc ) -list(APPEND MOLD_MACHO_TARGETS X86_64 ARM64) - -list(APPEND MOLD_MACHO_TEMPLATE_FILES - macho/cmdline.cc - macho/dead-strip.cc - macho/input-files.cc - macho/input-sections.cc - macho/lto.cc - macho/main.cc - macho/mapfile.cc - macho/output-chunks.cc - macho/tapi.cc +if(WIN32 AND NOT MINGW) + list(APPEND MOLD_ELF_TEMPLATE_FILES src/lto-win32.cc) +else() + list(APPEND MOLD_ELF_TEMPLATE_FILES src/lto-unix.cc) +endif() + +if(WIN32) + list(APPEND MOLD_ELF_TEMPLATE_FILES + src/output-file-win32.cc + src/subprocess-win32.cc + ) +else() + list(APPEND MOLD_ELF_TEMPLATE_FILES + src/output-file-unix.cc + src/subprocess-unix.cc ) +endif() function(mold_instantiate_templates SOURCE TARGET) set(PATH ${CMAKE_BINARY_DIR}/${SOURCE}.${TARGET}.cc) @@ -351,72 +362,54 @@ foreach (SOURCE IN LISTS MOLD_ELF_TEMPLATE_FILES) endforeach() endforeach() -if(MOLD_IS_SOLD) - foreach (SOURCE IN LISTS MOLD_MACHO_TEMPLATE_FILES) - foreach(TARGET IN LISTS MOLD_MACHO_TARGETS) - mold_instantiate_templates(${SOURCE} ${TARGET}) - endforeach() - endforeach() -endif() - # Add other non-template 
source files. target_sources(mold PRIVATE - common/compress.cc - common/demangle.cc - common/filepath.cc - common/glob.cc - common/hyperloglog.cc - common/main.cc - common/multi-glob.cc - common/perf.cc - common/tar.cc - common/uuid.cc - elf/arch-alpha.cc - elf/arch-arm32.cc - elf/arch-arm64.cc - elf/arch-i386.cc - elf/arch-m68k.cc - elf/arch-ppc32.cc - elf/arch-ppc64v1.cc - elf/arch-ppc64v2.cc - elf/arch-s390x.cc - elf/arch-sh4.cc - elf/arch-sparc64.cc - elf/arch-x86-64.cc - elf/elf.cc git-hash.cc + lib/compress.cc + lib/crc32.cc + lib/demangle.cc + lib/filepath.cc + lib/glob.cc + lib/hyperloglog.cc + lib/malloc.cc + lib/multi-glob.cc + lib/perf.cc + lib/random.cc + lib/tar.cc + src/arch-arm32.cc + src/arch-arm64.cc + src/arch-i386.cc + src/arch-m68k.cc + src/arch-ppc32.cc + src/arch-ppc64v1.cc + src/arch-ppc64v2.cc + src/arch-s390x.cc + src/arch-sh4.cc + src/arch-sparc64.cc + src/arch-x86-64.cc + src/config.cc + src/elf.cc third-party/rust-demangle/rust-demangle.c ) -if(MOLD_IS_SOLD) +if(WIN32) target_sources(mold PRIVATE - macho/arch-arm64.cc - macho/arch-x86-64.cc - macho/yaml.cc + lib/jobs-win32.cc + lib/mapped-file-win32.cc + lib/signal-win32.cc + ) +else() + target_sources(mold PRIVATE + lib/jobs-unix.cc + lib/mapped-file-unix.cc + lib/signal-unix.cc ) -endif() - -# Add frequently included header files for pre-compiling. -# target_precompile_headers is supported by CMake 3.16.0 or newer. 
-if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.16.0") - if(MOLD_IS_SOLD) - target_precompile_headers(mold PRIVATE - "$<$:${CMAKE_SOURCE_DIR}/elf/mold.h>" - "$<$:${CMAKE_SOURCE_DIR}/macho/mold.h>") - else() - target_precompile_headers(mold PRIVATE - "$<$:${CMAKE_SOURCE_DIR}/elf/mold.h>") - endif() - - # ccache needs this flag along with `sloppiness = pch_defines,time_macros` - # to enable caching - target_compile_options(mold PRIVATE -fpch-preprocess) endif() include(CTest) if(BUILD_TESTING) - # Create the ld and ld64 symlinks required for testing + # Create the ld symlinks required for testing if(NOT WIN32) add_custom_command( TARGET mold POST_BUILD @@ -424,36 +417,18 @@ if(BUILD_TESTING) BYPRODUCTS ld WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} VERBATIM) - - if(MOLD_IS_SOLD) - add_custom_command( - TARGET mold POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink mold ld64 - BYPRODUCTS ld64 - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - VERBATIM) - endif() endif() - if(${APPLE}) - if(MOLD_IS_SOLD) - add_subdirectory(test/macho) - endif() - elseif(${UNIX}) - add_subdirectory(test/elf) + if(${UNIX}) + add_subdirectory(test) endif() endif() if(NOT CMAKE_SKIP_INSTALL_RULES) install(TARGETS mold RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) install(FILES docs/mold.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1/) - - if(EXISTS "${CMAKE_SOURCE_DIR}/LICENSE") - install(FILES LICENSE DESTINATION ${CMAKE_INSTALL_DOCDIR}) - endif() - if(EXISTS "${CMAKE_SOURCE_DIR}/LICENSE.third-party") - install(FILES "LICENSE.third-party" DESTINATION "${CMAKE_INSTALL_DOCDIR}") - endif() + install(FILES LICENSE DESTINATION ${CMAKE_INSTALL_DOCDIR}) + install(FILES "LICENSE.third-party" DESTINATION "${CMAKE_INSTALL_DOCDIR}") function(mold_install_relative_symlink OLD NEW) install(CODE " @@ -473,13 +448,4 @@ if(NOT CMAKE_SKIP_INSTALL_RULES) ${CMAKE_INSTALL_BINDIR}/ld.mold${CMAKE_EXECUTABLE_SUFFIX}) mold_install_relative_symlink(${CMAKE_INSTALL_MANDIR}/man1/mold.1 
${CMAKE_INSTALL_MANDIR}/man1/ld.mold.1) - - if(MOLD_IS_SOLD) - mold_install_relative_symlink(${CMAKE_INSTALL_BINDIR}/mold - ${CMAKE_INSTALL_BINDIR}/ld64.mold) - mold_install_relative_symlink(${CMAKE_INSTALL_BINDIR}/mold - ${CMAKE_INSTALL_BINDIR}/ld.sold) - mold_install_relative_symlink(${CMAKE_INSTALL_BINDIR}/mold - ${CMAKE_INSTALL_BINDIR}/ld64.sold) - endif() endif() diff --git a/README.md b/README.md index 3da937c0..c6ddf37b 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,30 @@ # mold: A Modern Linker -This repository contains a free version of the mold linker. -If you are looking for a commercial version that supports macOS -please visit the -[repository of the sold linker](https://github.com/bluewhalesystems/sold). - mold is a faster drop-in replacement for existing Unix linkers. It is several times quicker than the LLVM lld linker, the second-fastest open-source linker, which I initially developed a few years ago. mold aims to enhance developer productivity by minimizing build time, particularly in rapid debug-edit-rebuild cycles. -Here is a performance comparison of GNU gold, LLVM lld, and mold when linking -final debuginfo-enabled executables for major large programs on a simulated -8-core, 16-thread machine. +Here is a performance comparison of GNU ld, GNU gold, LLVM lld, and +mold when linking final debuginfo-enabled executables for major large +programs on a simulated 16-core, 32-thread machine. 
-![Link speed comparison](docs/comparison.png) +![Link speed comparison](docs/chart.svg) -| Program (linker output size) | GNU gold | LLVM lld | mold -|-------------------------------|----------|----------|-------- -| Chrome 96 (1.89 GiB) | 53.86s | 11.74s | 2.21s -| Clang 13 (3.18 GiB) | 64.12s | 5.82s | 2.90s -| Firefox 89 libxul (1.64 GiB) | 32.95s | 6.80s | 1.42s +| Program (linker output size) | GNU ld | GNU gold | LLVM lld | mold +|-------------------------------|--------|----------|----------|------ +| MySQL 8.3 (0.47 GiB) | 10.84s | 7.47s | 1.64s | 0.46s +| Clang 19 (1.56 GiB) | 42.07s | 33.13s | 5.20s | 1.35s +| Chromium 124 (1.35 GiB) | N/A | 27.40s | 6.10s | 1.52s mold is so fast that it is only 2x _slower_ than the `cp` command on the same -machine. If you find that mold is not faster than other linkers, please feel +machine. If you find that mold is not faster than other linkers, feel free to [file a bug report](https://github.com/rui314/mold/issues). mold supports x86-64, i386, ARM64, ARM32, 64-bit/32-bit little/big-endian RISC-V, 32-bit PowerPC, 64-bit big-endian PowerPC ELFv1, 64-bit little-endian -PowerPC ELFv2, s390x, 64-bit/32-bit LoongArch, SPARC64, m68k, SH-4, and DEC -Alpha. - -mold/macOS is commercial software. For mold/macOS, please visit -https://github.com/bluewhalesystems/sold. +PowerPC ELFv2, s390x, 64-bit/32-bit LoongArch, SPARC64, m68k, and SH-4. ## Why does linking speed matter? @@ -68,14 +59,12 @@ necessary packages. You may need to run it as root. ### Compile mold ```shell -git clone https://github.com/rui314/mold.git -mkdir mold/build -cd mold/build -git checkout v2.3.3 -../install-build-deps.sh -cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=c++ .. -cmake --build . -j $(nproc) -sudo cmake --build . 
--target install +git clone --branch stable https://github.com/rui314/mold.git +cd mold +./install-build-deps.sh +cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=c++ -B build +cmake --build build -j$(nproc) +sudo cmake --build build --target install ``` You might need to pass a C++20 compiler command name to `cmake`. In the @@ -100,7 +89,7 @@ use the mold executable within it. On Unix, the linker command (usually `/usr/bin/ld`) is indirectly invoked by the compiler driver (typically `cc`, `gcc`, or `clang`), which is in turn -indirectly invoked by `make` or another build system command. +indirectly invoked by `make` or other build system commands. If you can specify an additional command line option for your compiler driver by modifying the build system's config files, add one of the following flags @@ -143,7 +132,7 @@ may be able to remove the `linker = "clang"` line. ```toml [target.x86_64-unknown-linux-gnu] -rustflags = ["-C", "link-arg=-fuse-ld=/path/to/mold"] +rustflags = ["-C", "link-arg=-fuse-ld=mold"] ``` If you want to use mold for all projects, add the above snippet to @@ -173,18 +162,18 @@ If you want to use mold for all projects, add the above snippet to It is sometimes very hard to pass an appropriate command line option to `cc` to specify an alternative linker. To address this situation, mold has a -feature to intercept all invocations of `ld`, `ld.lld`, or `ld.gold` and -redirect them to itself. To use this feature, run `make` (or another build +feature to intercept all invocations of `ld`, `ld.bfd`, `ld.lld`, or `ld.gold` +and redirect them to itself. To use this feature, run `make` (or another build command) as a subcommand of mold as follows: ```shell mold -run make ``` -Internally, mold invokes the given command with the `LD_PRELOAD` environment +Internally, mold invokes a given command with the `LD_PRELOAD` environment variable set to its companion shared object file. 
The shared object file intercepts all function calls to `exec(3)`-family functions to replace -`argv[0]` with `mold` if it is `ld`, `ld.gold`, or `ld.lld`. +`argv[0]` with `mold` if it is `ld`, `ld.bf`, `ld.gold`, or `ld.lld`. @@ -192,7 +181,7 @@ intercepts all function calls to `exec(3)`-family functions to replace You can use our [setup-mold](https://github.com/rui314/setup-mold) GitHub Action to speed up GitHub-hosted continuous builds. Although GitHub Actions -run on a two-core machine, mold is still significantly faster than the default +run on a 4 core machine, mold is still significantly faster than the default GNU linker, especially when linking large programs. @@ -218,7 +207,7 @@ If `mold` is present in the `.comment` section, the file was created by mold. Since mold is a drop-in replacement, you should be able to use it without reading its manual. However, if you need it, [mold's man page](docs/mold.md) -is available. You can read the same manual by running `man mold`. +is available online. You can read the same manual by running `man mold`. @@ -242,10 +231,19 @@ For details, please see the [design notes](docs/design.md). ## Sponsors -We accept donations via [GitHub Sponsors](https://github.com/sponsors/rui314) -and [OpenCollective](https://opencollective.com/mold-linker). We thank -everyone who sponsors our project. In particular, we'd like to acknowledge the -following people and organizations who have sponsored $128/month or more: +It is taken for granted nowadays that compiler toolchains can be easily +installed and used for free, and people may not think too much about the +individuals behind these "free tools". mold supports many projects, but it +is essentially a one-person project. This situation is similar to the one +depicted in the following xkcd illustration. 
+ +[![xkcd 2347](https://imgs.xkcd.com/comics/dependency.png)](https://xkcd.com/2347) + +If you think that the "Nebraska guy" should be rewarded, please consider +becoming our [GitHub sponsor](https://github.com/sponsors/rui314)! + +We thank everyone who sponsors our project. In particular, we'd like to acknowledge +the following people and organizations who have sponsored $128/month or more: ### Corporate sponsors @@ -255,13 +253,14 @@ following people and organizations who have sponsored $128/month or more: Emerge Tools
-- [Uber](https://uber.com) - [G-Research](https://www.gresearch.co.uk) - [Signal Slot Inc.](https://github.com/signal-slot) +- [GlareDB](https://github.com/GlareDB) ### Individual sponsors -- [300baud](https://github.com/300baud) -- [Johan Andersson](https://github.com/repi) - [Wei Wu](https://github.com/lazyparser) - [kyle-elliott](https://github.com/kyle-elliott) +- [Bryant Biggs](https://github.com/bryantbiggs) +- [kraptor23](https://github.com/kraptor23) +- [Jinkyu Yi](https://github.com/jincreator) diff --git a/common/cmdline.h b/common/cmdline.h deleted file mode 100644 index d8206ccb..00000000 --- a/common/cmdline.h +++ /dev/null @@ -1,101 +0,0 @@ -#pragma once - -#include "common.h" - -namespace mold { - -template -std::vector -read_response_file(Context &ctx, std::string_view path, i64 depth) { - if (depth > 10) - Fatal(ctx) << path << ": response file nesting too deep"; - - std::vector vec; - MappedFile *mf = MappedFile::must_open(ctx, std::string(path)); - std::string_view data((char *)mf->data, mf->size); - - while (!data.empty()) { - if (isspace(data[0])) { - data = data.substr(1); - continue; - } - - auto read_quoted = [&]() { - char quote = data[0]; - data = data.substr(1); - - std::string buf; - while (!data.empty() && data[0] != quote) { - if (data[0] == '\\' && data.size() >= 1) { - buf.append(1, data[1]); - data = data.substr(2); - } else { - buf.append(1, data[0]); - data = data.substr(1); - } - } - if (data.empty()) - Fatal(ctx) << path << ": premature end of input"; - data = data.substr(1); - return save_string(ctx, buf); - }; - - auto read_unquoted = [&] { - std::string buf; - while (!data.empty()) { - if (data[0] == '\\' && data.size() >= 1) { - buf.append(1, data[1]); - data = data.substr(2); - continue; - } - - if (!isspace(data[0])) { - buf.append(1, data[0]); - data = data.substr(1); - continue; - } - break; - } - return save_string(ctx, buf); - }; - - std::string_view tok; - if (data[0] == '\'' || data[0] == '\"') - tok = 
read_quoted(); - else - tok = read_unquoted(); - - if (tok.starts_with('@')) - append(vec, read_response_file(ctx, tok.substr(1), depth + 1)); - else - vec.push_back(tok); - } - return vec; -} - -// Replace "@path/to/some/text/file" with its file contents. -template -std::vector expand_response_files(Context &ctx, char **argv) { - std::vector vec; - for (i64 i = 0; argv[i]; i++) { - if (argv[i][0] == '@') - append(vec, read_response_file(ctx, argv[i] + 1, 1)); - else - vec.push_back(argv[i]); - } - return vec; -} - -static inline std::string_view string_trim(std::string_view str) { - size_t pos = str.find_first_not_of(" \t"); - if (pos == str.npos) - return ""; - str = str.substr(pos); - - pos = str.find_last_not_of(" \t"); - if (pos == str.npos) - return str; - return str.substr(0, pos + 1); -} - -} // namespace mold diff --git a/common/integers.h b/common/integers.h deleted file mode 100644 index 2ad02d0c..00000000 --- a/common/integers.h +++ /dev/null @@ -1,221 +0,0 @@ -// This file defines integral types for file input/output. We need to use -// these types instead of the plain integers (such as uint32_t or int32_t) -// when reading from/writing to an mmap'ed file area for the following -// reasons: -// -// 1. mold is always a cross linker and should not depend on what host it -// is running on. Users should be able to run mold on a big-endian -// SPARC machine to create a little-endian RV64 binary, for example. -// -// 2. Even though data members in all ELF data strucutres are naturally -// aligned, they are not guaranteed to be aligned on memory. Because -// archive file (.a file) aligns each member only to a 2 byte boundary, -// anything larger than 2 bytes may be unaligned in an mmap'ed memory. -// Unaligned access is an undefined behavior in C/C++, so we shouldn't -// cast an arbitrary pointer to a uint32_t, for example, to read a -// 32-bits value. 
-// -// The data types defined in this file don't depend on host byte order and -// don't do unaligned access. - -#pragma once - -#include -#include -#include - -#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -# define __LITTLE_ENDIAN__ 1 -# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -# define __BIG_ENDIAN__ 1 -# else -# error "unknown host byte order" -# endif -#endif - -namespace mold { - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -typedef int8_t i8; -typedef int16_t i16; -typedef int32_t i32; -typedef int64_t i64; - -template -static inline T bswap(T val) { - switch (sizeof(T)) { - case 2: return __builtin_bswap16(val); - case 4: return __builtin_bswap32(val); - case 8: return __builtin_bswap64(val); - default: __builtin_unreachable(); - } -} - -template -class LittleEndian { -public: - LittleEndian() = default; - LittleEndian(T x) { *this = x; } - - operator T() const { - if constexpr (sizeof(T) == SIZE) { - T x; - memcpy(&x, val, sizeof(T)); - if constexpr (std::endian::native == std::endian::big) - x = bswap(x); - return x; - } else { - static_assert(SIZE == 3); - return (val[2] << 16) | (val[1] << 8) | val[0]; - } - } - - LittleEndian &operator=(T x) { - if constexpr (sizeof(T) == SIZE) { - if constexpr (std::endian::native == std::endian::big) - x = bswap(x); - memcpy(val, &x, sizeof(T)); - } else { - static_assert(SIZE == 3); - val[2] = x >> 16; - val[1] = x >> 8; - val[0] = x; - } - return *this; - } - - LittleEndian &operator++() { - return *this = *this + 1; - } - - LittleEndian operator++(int) { - T ret = *this; - *this = *this + 1; - return ret; - } - - LittleEndian &operator--() { - return *this = *this - 1; - } - - LittleEndian operator--(int) { - T ret = *this; - *this = *this - 1; - return ret; - } - - LittleEndian &operator+=(T x) { - return *this = *this + x; - } - - LittleEndian &operator-=(T x) { - return *this = *this - x; - } - - 
LittleEndian &operator&=(T x) { - return *this = *this & x; - } - - LittleEndian &operator|=(T x) { - return *this = *this | x; - } - -private: - u8 val[SIZE]; -}; - -using il16 = LittleEndian; -using il32 = LittleEndian; -using il64 = LittleEndian; -using ul16 = LittleEndian; -using ul24 = LittleEndian; -using ul32 = LittleEndian; -using ul64 = LittleEndian; - -template -class BigEndian { -public: - BigEndian() = default; - BigEndian(T x) { *this = x; } - - operator T() const { - if constexpr (sizeof(T) == SIZE) { - T x; - memcpy(&x, val, sizeof(T)); - if constexpr (std::endian::native == std::endian::little) - x = bswap(x); - return x; - } else { - static_assert(SIZE == 3); - return (val[0] << 16) | (val[1] << 8) | val[2]; - } - } - - BigEndian &operator=(T x) { - if constexpr (sizeof(T) == SIZE) { - if constexpr (std::endian::native == std::endian::little) - x = bswap(x); - memcpy(val, &x, sizeof(T)); - } else { - static_assert(SIZE == 3); - val[0] = x >> 16; - val[1] = x >> 8; - val[2] = x; - } - return *this; - } - - BigEndian &operator++() { - return *this = *this + 1; - } - - BigEndian operator++(int) { - T ret = *this; - *this = *this + 1; - return ret; - } - - BigEndian &operator--() { - return *this = *this - 1; - } - - BigEndian operator--(int) { - T ret = *this; - *this = *this - 1; - return ret; - } - - BigEndian &operator+=(T x) { - return *this = *this + x; - } - - BigEndian &operator-=(T x) { - return *this = *this - x; - } - - BigEndian &operator&=(T x) { - return *this = *this & x; - } - - BigEndian &operator|=(T x) { - return *this = *this | x; - } - -private: - u8 val[SIZE]; -}; - -using ib16 = BigEndian; -using ib32 = BigEndian; -using ib64 = BigEndian; -using ub16 = BigEndian; -using ub24 = BigEndian; -using ub32 = BigEndian; -using ub64 = BigEndian; - -} // namespace mold diff --git a/common/main.cc b/common/main.cc deleted file mode 100644 index 98598802..00000000 --- a/common/main.cc +++ /dev/null @@ -1,169 +0,0 @@ -#include "common.h" 
-#include "config.h" - -#include -#include -#include -#include - -#ifdef USE_SYSTEM_MIMALLOC -#include -#endif - -#ifdef __FreeBSD__ -# include -# include -#endif - -namespace mold { - -std::string mold_version_string = MOLD_VERSION; - -namespace elf { -int main(int argc, char **argv); -} - -namespace macho { -int main(int argc, char **argv); -} - -static std::string get_mold_version() { - std::string name = MOLD_IS_SOLD ? "mold (sold) " : "mold "; - if (mold_git_hash.empty()) - return name + MOLD_VERSION + " (compatible with GNU ld)"; - return name + MOLD_VERSION + " (" + mold_git_hash + "; compatible with GNU ld)"; -} - -void cleanup() { - if (output_tmpfile) - unlink(output_tmpfile); -} - -std::string errno_string() { - // strerror is not thread-safe, so guard it with a lock. - static std::mutex mu; - std::scoped_lock lock(mu); - return strerror(errno); -} - -// Returns the path of the mold executable itself -std::string get_self_path() { -#ifdef __FreeBSD__ - // /proc may not be mounted on FreeBSD. The proper way to get the - // current executable's path is to use sysctl(2). - int mib[4]; - mib[0] = CTL_KERN; - mib[1] = KERN_PROC; - mib[2] = KERN_PROC_PATHNAME; - mib[3] = -1; - - size_t size; - sysctl(mib, 4, NULL, &size, NULL, 0); - - std::string path; - path.resize(size); - sysctl(mib, 4, path.data(), &size, NULL, 0); - return path; -#else - return std::filesystem::read_symlink("/proc/self/exe").string(); -#endif -} - -// mold mmap's an output file, and the mmap succeeds even if there's -// no enough space left on the filesystem. The actual disk blocks are -// not allocated on the mmap call but when the program writes to it -// for the first time. -// -// If a disk becomes full as a result of a write to an mmap'ed memory -// region, the failure of the write is reported as a SIGBUS or structured -// exeption with code EXCEPTION_IN_PAGE_ERROR on Windows. This -// signal handler catches that signal and prints out a user-friendly -// error message. 
Without this, it is very hard to realize that the -// disk might be full. -#ifdef _WIN32 - -static LONG WINAPI vectored_handler(_EXCEPTION_POINTERS *exception_info) { - static std::mutex mu; - std::scoped_lock lock{mu}; - - PEXCEPTION_RECORD exception_record = exception_info->ExceptionRecord; - ULONG_PTR *exception_information = exception_record->ExceptionInformation; - if (exception_record->ExceptionCode == EXCEPTION_IN_PAGE_ERROR && - (ULONG_PTR)output_buffer_start <= exception_information[1] && - exception_information[1] < (ULONG_PTR)output_buffer_end) { - - const char msg[] = "mold: failed to write to an output file. Disk full?\n"; - (void)!write(_fileno(stderr), msg, sizeof(msg) - 1); - } - - cleanup(); - _exit(1); -} - -void install_signal_handler() { - AddVectoredExceptionHandler(0, vectored_handler); -} - -#else - -static void sighandler(int signo, siginfo_t *info, void *ucontext) { - static std::mutex mu; - std::scoped_lock lock{mu}; - - switch (signo) { - case SIGSEGV: - case SIGBUS: - if (output_buffer_start <= info->si_addr && - info->si_addr < output_buffer_end) { - const char msg[] = "mold: failed to write to an output file. Disk full?\n"; - (void)!write(STDERR_FILENO, msg, sizeof(msg) - 1); - } - break; - case SIGABRT: { - const char msg[] = - "mold: aborted\n" - "mold: If mold failed due to a spurious failure of pthread_create, " - "it's likely because of https://github.com/oneapi-src/oneTBB/pull/824. 
" - "You should ensure that you are using 2021.9.0 or newer version of libtbb.\n"; - (void)!write(STDERR_FILENO, msg, sizeof(msg) - 1); - break; - } - } - - _exit(1); -} - -void install_signal_handler() { - struct sigaction action; - action.sa_sigaction = sighandler; - sigemptyset(&action.sa_mask); - action.sa_flags = SA_SIGINFO; - - sigaction(SIGABRT, &action, NULL); - sigaction(SIGINT, &action, NULL); - sigaction(SIGTERM, &action, NULL); - sigaction(SIGBUS, &action, NULL); -} - -#endif - -i64 get_default_thread_count() { - // mold doesn't scale well above 32 threads. - int n = tbb::global_control::active_value( - tbb::global_control::max_allowed_parallelism); - return std::min(n, 32); -} - -} // namespace mold - -int main(int argc, char **argv) { - mold::mold_version = mold::get_mold_version(); - -#if MOLD_IS_SOLD - std::string cmd = mold::filepath(argv[0]).filename().string(); - if (cmd == "ld64" || cmd.starts_with("ld64.")) - return mold::macho::main(argc, argv); -#endif - - return mold::elf::main(argc, argv); -} diff --git a/common/output-file-unix.h b/common/output-file-unix.h deleted file mode 100644 index 310d180c..00000000 --- a/common/output-file-unix.h +++ /dev/null @@ -1,144 +0,0 @@ -#include "common.h" - -#include -#include -#include -#include -#include - -namespace mold { - -inline u32 get_umask() { - u32 orig_umask = umask(0); - umask(orig_umask); - return orig_umask; -} - -template -static std::pair -open_or_create_file(Context &ctx, std::string path, i64 filesize, i64 perm) { - std::string tmpl = filepath(path).parent_path() / ".mold-XXXXXX"; - char *path2 = (char *)save_string(ctx, tmpl).data(); - - i64 fd = mkstemp(path2); - if (fd == -1) - Fatal(ctx) << "cannot open " << path2 << ": " << errno_string(); - - // Reuse an existing file if exists and writable because on Linux, - // writing to an existing file is much faster than creating a fresh - // file and writing to it. 
- if (ctx.overwrite_output_file && rename(path.c_str(), path2) == 0) { - ::close(fd); - fd = ::open(path2, O_RDWR | O_CREAT, perm); - if (fd != -1 && !ftruncate(fd, filesize) && !fchmod(fd, perm & ~get_umask())) - return {fd, path2}; - - unlink(path2); - fd = ::open(path2, O_RDWR | O_CREAT, perm); - if (fd == -1) - Fatal(ctx) << "cannot open " << path2 << ": " << errno_string(); - } - - if (ftruncate(fd, filesize)) - Fatal(ctx) << "ftruncate failed: " << errno_string(); - - if (fchmod(fd, (perm & ~get_umask())) == -1) - Fatal(ctx) << "fchmod failed: " << errno_string(); - return {fd, path2}; -} - -template -class MemoryMappedOutputFile : public OutputFile { -public: - MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm) - : OutputFile(path, filesize, true) { - std::tie(this->fd, output_tmpfile) = - open_or_create_file(ctx, path, filesize, perm); - - this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE, - MAP_SHARED, this->fd, 0); - if (this->buf == MAP_FAILED) - Fatal(ctx) << path << ": mmap failed: " << errno_string(); - - mold::output_buffer_start = this->buf; - mold::output_buffer_end = this->buf + filesize; - } - - ~MemoryMappedOutputFile() { - if (fd2 != -1) - ::close(fd2); - } - - void close(Context &ctx) override { - Timer t(ctx, "close_file"); - - if (!this->is_unmapped) - munmap(this->buf, this->filesize); - - if (this->buf2.empty()) { - ::close(this->fd); - } else { - FILE *out = fdopen(this->fd, "w"); - fseek(out, 0, SEEK_END); - fwrite(&this->buf2[0], this->buf2.size(), 1, out); - fclose(out); - } - - // If an output file already exists, open a file and then remove it. - // This is the fastest way to unlink a file, as it does not make the - // system to immediately release disk blocks occupied by the file. 
- fd2 = ::open(this->path.c_str(), O_RDONLY); - if (fd2 != -1) - unlink(this->path.c_str()); - - if (rename(output_tmpfile, this->path.c_str()) == -1) - Fatal(ctx) << this->path << ": rename failed: " << errno_string(); - output_tmpfile = nullptr; - } - -private: - int fd2 = -1; -}; - -template -std::unique_ptr> -OutputFile::open(Context &ctx, std::string path, i64 filesize, i64 perm) { - Timer t(ctx, "open_file"); - - if (path.starts_with('/') && !ctx.arg.chroot.empty()) - path = ctx.arg.chroot + "/" + path_clean(path); - - bool is_special = false; - if (path == "-") { - is_special = true; - } else { - struct stat st; - if (stat(path.c_str(), &st) == 0 && (st.st_mode & S_IFMT) != S_IFREG) - is_special = true; - } - - OutputFile *file; - if (is_special) - file = new MallocOutputFile(ctx, path, filesize, perm); - else - file = new MemoryMappedOutputFile(ctx, path, filesize, perm); - -#ifdef MADV_HUGEPAGE - // Enable transparent huge page for an output memory-mapped file. - // On Linux, it has an effect only on tmpfs mounted with `huge=advise`, - // but it can make the linker ~10% faster. You can try it by creating - // a tmpfs with the following commands - // - // $ mkdir tmp - // $ sudo mount -t tmpfs -o size=2G,huge=advise none tmp - // - // and then specifying a path under the directory as an output file. 
- madvise(file->buf, filesize, MADV_HUGEPAGE); -#endif - - if (ctx.arg.filler != -1) - memset(file->buf, ctx.arg.filler, filesize); - return std::unique_ptr(file); -} - -} // namespace mold diff --git a/common/output-file-win32.h b/common/output-file-win32.h deleted file mode 100644 index a9dd0005..00000000 --- a/common/output-file-win32.h +++ /dev/null @@ -1,24 +0,0 @@ -#include "common.h" - -#include -#include -#include - -namespace mold { - -template -std::unique_ptr> -OutputFile::open(Context &ctx, std::string path, i64 filesize, i64 perm) { - Timer t(ctx, "open_file"); - - if (path.starts_with('/') && !ctx.arg.chroot.empty()) - path = ctx.arg.chroot + "/" + path_clean(path); - - OutputFile *file = new MallocOutputFile(ctx, path, filesize, perm); - - if (ctx.arg.filler != -1) - memset(file->buf, ctx.arg.filler, filesize); - return std::unique_ptr>(file); -} - -} // namespace mold diff --git a/common/output-file.h b/common/output-file.h deleted file mode 100644 index 63299ed9..00000000 --- a/common/output-file.h +++ /dev/null @@ -1,5 +0,0 @@ -#if _WIN32 -# include "output-file-win32.h" -#else -# include "output-file-unix.h" -#endif diff --git a/common/uuid.cc b/common/uuid.cc deleted file mode 100644 index 89b87b72..00000000 --- a/common/uuid.cc +++ /dev/null @@ -1,20 +0,0 @@ -#include "common.h" - -#include - -namespace mold { - -std::array get_uuid_v4() { - std::array bytes; - - std::random_device rand; - u32 buf[4] = { rand(), rand(), rand(), rand() }; - memcpy(bytes.data(), buf, 16); - - // Indicate that this is UUIDv4 as defined by RFC4122. 
- bytes[6] = (bytes[6] & 0b00001111) | 0b01000000; - bytes[8] = (bytes[8] & 0b00111111) | 0b10000000; - return bytes; -} - -} // namespace mold diff --git a/dist.sh b/dist.sh index 4ff24f0e..1137feaa 100755 --- a/dist.sh +++ b/dist.sh @@ -33,6 +33,7 @@ # $ docker run --rm --privileged multiarch/qemu-user-static --reset -p yes set -e -x +cd "$(dirname $0)" usage() { echo "Usage: $0 [ x86_64 | aarch64 | arm | riscv64 | ppc64le | s390x ]" @@ -42,7 +43,11 @@ usage() { case $# in 0) arch=$(uname -m) - [[ $arch = arm* ]] && arch=arm + if [ $arch = arm64 ]; then + arch=aarch64 + elif [[ $arch = arm* ]]; then + arch=arm + fi ;; 1) arch="$1" @@ -53,43 +58,52 @@ esac echo "$arch" | grep -Eq '^(x86_64|aarch64|arm|riscv64|ppc64le|s390x)$' || usage -image=mold-builder-$arch -version=$(sed -n 's/^project(mold VERSION \(.*\))/\1/p' $(dirname $0)/CMakeLists.txt) +version=$(sed -n 's/^project(mold VERSION \(.*\))/\1/p' CMakeLists.txt) dest=mold-$version-$arch-linux +if [ "$GITHUB_REPOSITORY" = '' ]; then + image=mold-builder-$arch + docker_build="docker build --platform linux/$arch -t $image -" +else + # If this script is running on GitHub Actions, we want to cache + # the created Docker image in GitHub's Docker repostiory. + image=ghcr.io/$GITHUB_REPOSITORY/mold-builder-$arch + docker_build="docker buildx build --platform linux/$arch -t $image --push --cache-to type=inline --cache-from type=registry,ref=ghcr.io/$GITHUB_REPOSITORY/mold-builder-$arch -" +fi + # Create a Docker image. 
case $arch in x86_64) # Debian 8 (Jessie) released in April 2015 - cat < /etc/apt/apt.conf.d/80-retries && \ +RUN sed -i -e '/^deb/d' -e 's/^# deb /deb [trusted=yes] /g' /etc/apt/sources.list && \ + echo 'Acquire::Retries "10"; Acquire::http::timeout "10"; Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ - apt-get install -y --force-yes --no-install-recommends wget bzip2 file make autoconf gcc g++ libssl-dev && \ + apt-get install -y --no-install-recommends wget bzip2 file make autoconf gcc g++ libssl-dev && \ rm -rf /var/lib/apt/lists # Build CMake 3.27 -RUN mkdir -p /build/cmake && \ - cd /build/cmake && \ +RUN mkdir /build && \ + cd /build && \ wget -O- --no-check-certificate https://cmake.org/files/v3.27/cmake-3.27.7.tar.gz | tar xzf - --strip-components=1 && \ - ./bootstrap --parallel=$(nproc) && \ - make -j$(nproc) && \ + ./bootstrap --parallel=\$(nproc) && \ + make -j\$(nproc) && \ make install && \ rm -rf /build # Build GCC 10 -RUN mkdir -p /build/gcc && \ - cd /build/gcc && \ - wget -O- http://ftp.gnu.org/gnu/gcc/gcc-10.5.0/gcc-10.5.0.tar.gz | tar xzf - --strip-components=1 && \ +RUN mkdir /build && \ + cd /build && \ + wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/gcc/gcc-10.5.0/gcc-10.5.0.tar.gz | tar xzf - --strip-components=1 && \ mkdir isl gmp mpc mpfr && \ - wget -O- http://gcc.gnu.org/pub/gcc/infrastructure/isl-0.18.tar.bz2 | tar xjf - --strip-components=1 -C isl && \ - wget -O- http://ftp.gnu.org/gnu/gmp/gmp-6.1.2.tar.bz2 | tar xjf - --strip-components=1 -C gmp && \ - wget -O- http://ftp.gnu.org/gnu/mpc/mpc-1.2.1.tar.gz | tar xzf - --strip-components=1 -C mpc && \ - wget -O- http://ftp.gnu.org/gnu/mpfr/mpfr-4.1.0.tar.gz | tar xzf - --strip-components=1 -C mpfr && \ + wget -O- --no-check-certificate https://gcc.gnu.org/pub/gcc/infrastructure/isl-0.18.tar.bz2 | tar xjf - --strip-components=1 -C isl && \ + wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/gmp/gmp-6.1.2.tar.bz2 | 
tar xjf - --strip-components=1 -C gmp && \ + wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/mpc/mpc-1.2.1.tar.gz | tar xzf - --strip-components=1 -C mpc && \ + wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/mpfr/mpfr-4.1.0.tar.gz | tar xzf - --strip-components=1 -C mpfr && \ ./configure --prefix=/usr --enable-languages=c,c++ --disable-bootstrap --disable-multilib && \ - make -j$(nproc) && \ + make -j\$(nproc) && \ make install && \ ln -sf /usr/lib64/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 && \ rm -rf /build @@ -97,11 +111,20 @@ EOF ;; aarch64 | arm | ppc64le | s390x) # Debian 10 (Bullseye) released in July 2019 - cat < /etc/apt/apt.conf.d/80-retries && \ +RUN sed -i -e '/^deb/d' -e 's/^# deb /deb [trusted=yes] /g' /etc/apt/sources.list && \ + echo 'Acquire::Retries "10"; Acquire::http::timeout "10"; Acquire::Check-Valid-Until "false";' > /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y --no-install-recommends build-essential gcc-10 g++-10 cmake && \ ln -sf /usr/bin/gcc-10 /usr/bin/cc && \ @@ -110,11 +133,12 @@ RUN sed -i -e '/^deb/d' -e 's/^# //g' /etc/apt/sources.list && \ EOF ;; riscv64) - # snapshot.debian.org is not available for RISC-V binaries - cat < /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ apt-get install -y --no-install-recommends build-essential gcc-12 g++-12 cmake && \ ln -sf /usr/bin/gcc-12 /usr/bin/cc && \ ln -sf /usr/bin/g++-12 /usr/bin/c++ && \ @@ -123,18 +147,28 @@ EOF ;; esac +# Source tarballs available on GitHub don't contain .git history. +# Clone the repo if missing. +[ -d .git ] || git clone --branch v$version --depth 1 --bare https://github.com/rui314/mold .git + # We use the timestamp of the last Git commit as the file timestamp # for build artifacts. timestamp="$(git log -1 --format=%ci)" # Build mold in a container. 
-docker run --platform linux/$arch -i --rm -v "$(realpath $(dirname $0)):/mold" $image \ - bash -c "mkdir -p /build/mold && -cd /build/mold && -cmake -DCMAKE_BUILD_TYPE=Release -DMOLD_MOSTLY_STATIC=On /mold && -cmake --build . -j$(nproc) && -ctest -j$(nproc) && -cmake --install . --prefix $dest --strip && -find $dest -print | xargs touch --no-dereference --date='$timestamp' && -find $dest -print | sort | tar -cf - --no-recursion --files-from=- | gzip -9nc > /mold/$dest.tar.gz && -chown $(id -u):$(id -g) /mold/$dest.tar.gz" +docker run --platform linux/$arch -i --rm -v "$(pwd):/mold" $image bash -c " +set -e +mkdir /build +cd /build +cmake -DCMAKE_BUILD_TYPE=Release -DMOLD_MOSTLY_STATIC=On /mold +cmake --build . -j\$(nproc) +mv mold mold2 +./mold2 -run cmake --build . -j\$(nproc) +ctest -j\$(nproc) +cmake --install . --prefix $dest --strip +find $dest -print | xargs touch --no-dereference --date='$timestamp' +find $dest -print | sort | tar -cf - --no-recursion --files-from=- | gzip -9nc > /mold/$dest.tar.gz +cp mold /mold +chown $(id -u):$(id -g) /mold/$dest.tar.gz /mold/mold +sha256sum /mold/$dest.tar.gz +" diff --git a/docs/chart.svg b/docs/chart.svg new file mode 100644 index 00000000..e2f71c0c --- /dev/null +++ b/docs/chart.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/comparison.png b/docs/comparison.png deleted file mode 100644 index aece81590e93faa0081824189e22087a31d4d71e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 72057 zcmeFZXH*mK*EWhEpdg~s1O+T0(vjY)fPhpX^j@Tf8mfRuQxH&)E=6h}A@mliAfWUX zdJ&NpdJQB5&Y-`4d7tyHb-tXBPu5B@%*;JA+4sHMwXe-PEe&M~G6pgN0s;z^XHRtq z2rk132ngq{kN|rkGv3t`5Rm6}Q;A~4k@a$b;GO6wh2imR{%_~c!={Na{7dGrdZud;@Py|z#$p}9-`;6ZlmD)j9e+s9T4ToWW6bEGym zw8y@_p%B8oX6^0GoRta~IO)ePEeBy5Fd3MP{a^yE9I;y}BybYh0#RiK`rpcNn$PkU zv6rsiCSVvOwz;nV$WrNP*rjie@aCKCrUWr3#@^P|XZZ7TM+V8}_v8eh!raI-JNus2 zF&Z-0-Ez2200pstne1GWFwWs%kW~OBI_qNm%N?8Xwet=6gZ({%Kw+8FxluIgS_qfhRBn26u+kPCXTLc;NK;1k}# 
z?51(Oi1KYl_~#4(l#F@r=VJC~{iDuB9O?6=Z;$s`#e%vkS?F(ezY0?Nz|YeYqqEf~ zXVzcTbhG*$)TZ zSGm0C(Gg6tO@sFiXt9L37X0w5>+Q=s*HSJCENF=`WKxpY`dUn+_`WAdh-B~7 zwK%d$fhv|{hL*h<;9AJ!%?f0);POzj!P%la{1!BDd(_j9gf{wtj?#S^eHJztI< zy<)vIJ#cX&h!vVv#IHvOrGWD=FjE_rWVM=h{edbY*s}$h!h5nW+FCW zed9bs;L+w>ykaDRDmwvDsZyx8@{Y%7+{gz3&fK~_OO8|+5h@JOSkd534f4V&NEr+m z@#?si&w-nq{2yXTuyWX2co*!-LF6@il4cSIk|mNt3KEK#My%_nO-(eHFlhahu08K=d|X`{VHee81CYqt{B^^p&5&^OaxcYq9cTy zH?6p<5H-zUqUpM+x6U=wUei|2vo0Mq)is!n@7~WFOd9+f%mUH^Qv$yPM&sA%3j}Vx zbWR%*TWko8R)}b4id2qeh`k>h9_te8ngP$K(?Ds|X0Z2%^!H>KXUz9jrT)mOw_XmU zEADYlM)5H6%$v-bQ{$<`sGh0&9d z6w=gE{N2;MAoU<-Lne4W+&lPF@Q2`c!TPV2WS`6GwT{Rx%F0;;{&f4GHux+1$l|%1 z{Z7Hp0OvsAWYaFD>+a-+3?9;-?lRCA(O|iH#h){b3E#XT`YJT=0C4aF&>! zT!UNJ^t+)i7k$Fr^WvTt z4|X{Nr1*6pdMLU+S|aH-w_$yr54-0Y!gjFYlPzP9dJ-M?w)xE}t8>*~-3N&+wBpoB zG|lETwZ$dRTI?0I=PBi6M(JSGYSG&zMf=T$)zf1^ybFmY?2yMLkMYRZ zhOO?nSQ*e6)F2Dl>=e2l!FZ?qPRE^X@ZMxq%kw69Hr#c8qpxr2U@v|TV@wOKH+fQN zwF3GDIvLCCF6$n1DVP=p;Co|*nI}H*(n0=}80P$eEQ>RjQI*k-$!=4HNwBn%uL3Y zZ|!G{!daxzoec(WpUfj{3(xl4(eP$OG80K+c7UvY59xpFj9I8L| ztQ{DTnupv!Xv>cdp&NkkOAMIoA-(o0x|SC8Z|fT&kY+B;%N`|b`pM?$NRx)N(gsu4 zriGx~k^m59EnQkaTdCkYv2b`M?H)Y})alA}6lD_+Vn1$6`nz$!% zYMD@}BHv<6)sYBno*}>_K;saLF$pEj1idkdH?jiB>uyR z<$?7NwhT?2jDWOH&^sJIUmAu6Q@xjts;$}nGSAG(PeXQ9xtl(j@UhL;q9TIE*~r!3 zmkFS6BkuL54^A$UsK2`Np6Xar@=HfoAI)o$Z!sz?APIuVkG9Q(1bXHKoxioKcwc8^ zB(9r3QiFJ>K^2vBA1x=y?`kj;nA{KGua~1cb{eig@Mdwkm z{T}`;@y~7HFZgr8#d#?$?;W~xO{fo3^IiAQc_8SGvNce#Q&%V80Jg6XT)xCWKn!eM z0^ZV>?)-aO>C$5YqCfiyfjHTL;POA`XaL_AuMfccV$9!fqWA~`QsA#!z}q*Q@Ly+N zhG!G~YnyNmI7T4%LP13Z_^!-2n~9@u+%xry`g z`uO_OkTl275mIdy#)#=c%oywTFY7mxC+# z-o+qjDfj$5ES-=nEy*R@Agols!-`56)N?z<0*K+W+ zbvAtJ-~z}DxQDcefS}}`^Z#E*{&UCw8foyKkx!n83IF%d|2p(Phw6FSdMLQM0QdBg z{?B~%<`_aqkAq0e1?!Ep$ zKP-r^z(sHT&oLL*;b)b#NGw8ISpVM+0)t5YZ~Omu&_5IKzk>K*LHwWE$j|!WU901d zc-G}<*A{hRyHwkxCp|n3@RNl|=HAnjeb04+ppzx_kpgwkxcX4pKZM-O-MnW~yWXmh}tL`oE{S+}hxRfshtV5`21%dByNCooRx==+R& z!Z5hf;IGkfykhg9%pbQ|+xPwBLnmhwu}7;t3#}nUeWpIMwq|}yi8Bv^F%9#nCAD+n 
zm=uX!!pfHeHRhj#PI}Bc2fhi=&>Zh}G4~0Ln^;Sq?zDeD^j;nOdayYyvM2WF5mntn z#2t_O;byyCGH2yVTH1XBi5{g5Gd*erpYip}y{Xnb^}pk1BA5aXh6=2^V(Yt>nmxY~ z)BVQ3_Uxp8nUbh)RzCy#F}g=R9C*6hRq&!vBXtUa1ot-j?QdWxYd_odrKPr<9hPoc zqFfIZy29zqQl_tC0@+3GFZxhl*2t8cbOs(lv9{`yUo;P#P~h4fVJ-~D^!~}0NG79@ z>I$;IGN@n!W?9u-6pUOe(Dj<3E0fr69BIU&VfBz321}2vwxQuaQl41%Rt`uz)lAym zsl;tHboKX8CDI(P$ecTjq8}o(%-#8a6q*O)4{Hw70>U+Jd;R*E>==aj6fnDFmbtB) zwN1$=J{rM36?wEdoo*c`C5)N{M0jR3%F&fLqA4>AujuC3vJ7nqJXuQa!(--ML8cJB zk(S_dlIr03;PWQ2^V5A#Tplfm&eThL%e{L06gPj~Kh1;*ESj25w80D5_43uudZ!Ob zn*b}JM#ySrT{Tq%OrMx}cCwO{-h$tBtw%8h9WU559gg)#EnolZR$>%iw-%R=dcDIH zB}qI~djB2r?g0V&O}>svHuW?cr5FZ1HwscF-|V%aT;;ERz7=HzAv7gg=C5`YJ-3@M z%hx}RXh`%`foH`rqA?ONtYtbWxuBd%yS|F|PVjP)5IqJ`-4NTc@m7x0uLW8bdfko0 z(Bi#OMxPnWN6%9%#%FUB$vykUR&G}h=l=C_gDIE!ZyE&d{a{nVA4b!}j4#sRFYzAr zR3dc_)<(Jp{acecgU-#*vB@r`OUo=}9=A{}6fLivqJwbL^HX3|hJ7XLc>7Zhypcvx z6G4dH;&piQ$?~0P?6(-1s@={z=|tB>vyVu1+w_H#T2)O6Q2u>1gq&>J_0w1egQ9eJ zfVnE{&x0l=Xx2QcuF|E`sVz;!UQ?}s^oIX&!?Ly)Pslo}7o(W}M1r*u;ig^_@_NGmaFjMlX{)4 z8_j7row+p5fnh;JPN%-t(njS#4V{o%+z67jOjWIBuSaiir;VKVH;K?Vu?4lbP5uo7 z+x$nrY8OLDnfL}|&R>Be5;UbJt6Zjq0rmVFDhPUoC{Y;*9Yk?q&Tbkc8dtsQxg+UY zb^=`6gZw?iHAxJMk0bX7ma^i*ww|3f4^oirL>@`y+j`HHv}oRD(OQ{A=Sd)tuwK#V zCcF${CrY;5T@ba$mg_AVYX6NO?74sgG5G~*Gn$03TVFXu8%V5Bui=Q#Zf#fXN~*^^ zDC@Zs0P&~ZTVJpmRmA5fn{~UKykpC>4?2XIFEJJRMYeLNn35lHtPikv6|#HBvkxDxa(J;1XwdMS zd4h4#mjiKXV70(>fa`Fkc#PR|8Dz)8B!_Ih|@cqU6Y?>W4W zz0N59X@&+cUBOjEBaU!h{7C6Jth5>FLW?Mm>Yl9DUJP?M&jjpdBWw_|Hr%mMw;1b9 zmu%$7Ub&p+gl#ZhJnKyr*c4v(KH8X6T8t^8Z!jwi{t_5N{7Lj{sYJ};@3dYg^HByB zNFgWjniwiDUD%};yI$(LUjI!m)dLmT>E^CzUBcaRx_4yYw_Z}av0JdH+LdDD1LfRk z3n$;)Ko{pNPSTj}4$GvDIeDEb(J)FBQ>CtE#k%OT`OoqO?M6eB0ng(N)i8BmAE|)! 
z_V{K{!E z2dM2M8n%$crL;6Wu)#%^5WV<;MBe7(2R^%mF`UbDrBjp;u@U#8fk%umaj;%8l3SR> zu!tM3J8x8IZZhj&6d$-d|8ilawaQ_nw_&^438R?hS4_1TEuHHn`N`;a4x$u$4ds8B zidi>5pE~mt7WY2*tXe0Pj~M7Hbs0~b)n(?uZs@%IlxJ`3bqec0)YLaZt;Bp@`hs)c zR@C=2zWD0o2e6fwP z_8A2n{id{p2UU{#AFGnW&gEiur<_V%g6|e> z&Tk_}Q_WzDdL7KT=W&O*F>~h}Jl|EsOJ6Oj3Z0E9yTqMIJ8r1z3(vputlVzKAtPrV z{0Za-vxynhEfRnK=DFL8jaTC-XTyw}`j~Eo2}m*=usWRMF$1O_)!71TXFQ_uIduK& zOq|CjeXc<-4!$Yw%iuOD?G9Flw(>vLiu=8w+C|n>4E@+Je3?O0AM2JwLKuTIifR6d z?iR-8m_i?iO$KZ?JxtbKg}4udLkf5MtGGpg=h^xu@T#OH7*HBv1}$j^cKyKb%0wN= zidpaJRE-RAdPXO0#p&XNd6s)7%!>(2K@ySFmj(0&-mm4-~((Bg|4ZVO)p&yOcp`HYOyQzFT2A{JBXyG)h zrk?NqtB|&Ah3jJ9GRqTJvc+@8Mn7;4>VMKCAnjez+x#cBtuy>Zxmd9 zYaC@bR_sGu7T!&3>}+{7<2hhr!6dY;E;4KLNHgA!pa zYu${>du4d#%bb9=;v)5$fEN%KYHO|(I7l2k3`aL06c2y8#x<**&Y$D6>hVX@E*5mP zuOMm?$3H`<&hb3T8R6Dq9WK*J#$bnb&fu9s`F(W2Gu@)Gd4I-Aseq&2X%_tEI1Qoc z5T<{>UxIgNr6_=+_OF*1)xSi@S-RYo@Fe3H#_m1yqa35NA1QPCHIs#soJ z`4esGSM1J1ylX1a8g2oYtqffLleP7kSCFglgQ;n7Lmw2PHJroSg?y8%VYhD7hMW$> z%=Q6-|72f|{Z;y*9WQ_9FvnwD`L@B@<Nr?Z}x7WtqkN6U|SkFN~} zN*nj0C`b}c)yMW|wpP17LzB|{rLV!s0+SMlhc$XYDhvjS z#NFqH@~7w6fKpidSbkVYqBIU=rODkBMN@q+?2{+Y-C6bC17(;`2y;9ynSYrT_&&D1 zEG#%K=S|6ukfhRI?|1*o4FkXG+nMkEG?%u}KT`7xTV2fC>x!VxZ8&F*-Hpu)2#wsd1>g$|-OfIx-zM8~BR`LUz= z#_-j8BRW(*HB2JhT3j0hqkurJsNJs>L_fR+Z`-`e41;B^W2vR-q@~Y}7hT3F)b%r< z;GC%EUNfE5?=X0!B8<&e`ZM}IDl@Gy zjiH4l<+RAZl<6tNe_*?IPjxn>&i`4(78FES>=UkMDbmE!!+9h<(%ccKgcfTe#T~8} zHV8!-4a@=yNvOc-jkLn39xqy8C|J&vh?AJz(QsY0{DXYf*JM*~!x-9pel3`Mli9X$ z#^p%G5_(Ok!%;Vn7+R2!-Lc;r>3~Hp>L6yXK{;*So^LB5HM5lT4@C}E-Yv|Bvt|0W zt(xl&m2<__4e(dL*YW8s3k%UZZag6W$DO0kX#$NMcJ9|6udl<#}RbarYOP&N9fmSCAjaR@M#2 zJ(SyBsEn}^-orglMN<_-GZxLT@TJF@#@~+}*x`K!}U~M2nJ2EyV>DZ1nO@;98 ziBZ*IdbNuS?U5_ohFt|n{7Kf!DKI4Q42bFXmSeE+SaQ+BT4nC31HbH?Mbl{*)k#^H z%;VJ)u0SUH06~pUGPED(2E>JPRCUX8Fri4#A5%>t)Q-i9U(SL~6iuUT8I$Bg66cZ` zG&L71);u4fEBQ>!jsdp^LFwr)G16gAC4asq+9MQhiZt=D4`#%2>5{ijB|X-!<-$`8 zd#`R+Mja{21cjAYEe378Gx8QgKcDshvgF@^2xvglnfKE~o#no%dH>+x10SQ{5tEPk 
z%!r1ye}x=_m#@GZ8DhlgN0JExwIM{T4U>nk!p<94G~iGUr*?x*9k!asf)`^%rb4u51w}Ms15DDV^Fk7vzoA)%WZ3mYu4UW} z;})Iqep;P3m}17DSzNAGKxh) z&J5?R+WDZfUz>PnlAQkoa8P)yzL}rxoX0Q1?~SQslFtOnbXK!@G>a3;B#$Pn?KURL z{Sr_{&)Gf%u~YMfe~mUP#N@Ye_zwJ@?TdYw;rEYIEu& zjzLUM6K2jh*Vw^Vvi{`IFxMr`O7`{_vePl3jg|#_ZRDk)?jJ-b zTP}1j$l}WUw!f*aMSiw6H;t(iUGRxz54jXwjmB`*pQWOD72f8r{kq6nqsvQI`+HtX zbkIAr)omJswS84bH z#(XtM7n*lM9=I|Xr-6NSq^6sa49ak5$`8|0S?T*V?>}#VfjA#?&mQ2l=O42FlNj1u zxP0$ijZVR{aGc_FI%+Nk`wO%h#(Me&kw6)EX}71$vVG48%>Go{Cjv@kj1dnE8B&SG z4a3&Zw{j5otGQK(U{IcyU5q}TU4(S(XBUzU`(T`&iWvvAMzH}r#ajpKWk%1wB|?)d z7bcV)P|CU%iFt@>UKK)4Vs~*rj9R=FAIIvI_IPNksx=sDt77tw3_hA8Z#t3_>q1<9 zcCXS1p|MPTt1D(8w@h6>(Q^sYa^5uQT-i6%9`8Cr2Cr&y)HZ+~>fbhcO3dbdDWUUZ znBZrS_WIq7nWw}|;O4re`V|N;r<@1$^oMC>9ofr6Dh71bU6I=b6|db@5p;RdCPCd<^ydKcSkp2p?w-30!D=;~nXjeU4cgey zgz5nz7tdo1A0j-l(|vTM%si)!?c8s6{C0$+%Tm;wYY;h&dK?)%o@<)Ba>Av&RaGv< z=qoY@2{P@hrX8O5Wy}>ntCqp0fB8C)YxEpxON)#lgHsct_t_DvUE=tXqez)6#)kQZ zFQ*WB_jTF#?RH%Q4M@jLJk{so5|EE0jm}^ElYV~((r>fP2sUeRxyN3pMktj6Bp?b0 z5p*6d(OakUX<7aDjaR$n6K0=tyxK6tBrp24EF!d>U->MC7L)+mc@LdDaum zvp?n^+(+b6-!;ax2pE_jx`A+cGNFw`=YR7D75*;Y4o z``YqaZuFgsZfoAx&35-}FC7@jqoeI7bSbXD=_@x=aGHw<&|se3Nfb$u_w`~)}sHUjw({NhC4 z%DfUWs6+m0jM@V>{Tw6u`V}u&5~t(WU#hpO>eo~*6e?;nvWE9sc5YiOpT+RIMr8G| zB3pb1f?;KuGMTPAV*%G2+)}p`R!cTV-CBPuwt*^W>?uS6QhMl`8$5;f zgd9hqjqa2*Ip~S!I$n~;v&|^hu@+U^+XmrXcJUDEv_b=m{Bg5@&FbY)O6l*$?_;GH zCDuz@nki)PJPUpTW4m&<4OqpNtGnG=1l6}`n#?YyXRn9R!1mC!=?KMZ`K;9XH{fyh zUoJ@h6KrW*1Y5&9HqTqM)|1hIgrR~wiJV(e_FL|wF2|gi!8xU0b4vkCq%pZ8X6U2H z$N=R*zDdg%@)ipJRVw+nu;jaIny;(gL#aAe8*q!XcU6$r!h6(i860j*j@XwS z%Cp|>;<%RA=7t#5+iE)uV?zLmncxT9&#$4Lu*XM4Vd>h|SJuACAfTv3#A^Oe{qv}5 zFLJLt za0_aH=t)2-!INX&j{vK+<&|IcioC%-e3};6i{=JW8(Z<Qr^aWj&NldLbWpYd&h2&$coz9HCz=>^A>n=&3V54)2bnyz zAOGo1j%)A>xH=7puMppL;$<>e%7%v@X?(Wtid3txF@65jdVj4;5ON8j2HZCbFU|MLco`WBOC)$b;A$q>? 
z42aUvuKY0Ng^GmBAj>OXY>zLQ)b;MTi(XgqeQ>AWeAV5+=d83T+>Gw50v%@=Zx;y6 zsVK>_FD@EDp3X*4^C-=N?x;i@i#FsrPOM&`FKhi~tft=qB#&+&egwT!HRhxShUVC`s~E`zz!HbqN5 zOh(gJ27$6Zo>TW|q$UTZ2ZY;~r3I!O3O@K^%^pW{U=Nr5vl7F_XV&BA9#O!nBu)8# zHRHBmLnR_$Q*mh_Gn-L8w?#854N4*yfa~5i%@9zzYP?yGfH{n()|AWuS)E_H%{h7B zVlbw-XCCXuXz7H~X%-3V`Ss+lQ8)yuAl<3U8E7^gsSdO8;7$Z1YJ~kQKg(fQ1&R*-`_|6J5`$d1i@^zXht;|*bWgOz@^ zXAe2)ZA?RLz9F*%Dd>yKu-xj>EfO6|wS*l%BZDUNbvB9HUcR;K5$N-%dRnbh#MWFSQfqDA%Id9sL$?aR{WaZ0rpE9<@ybu(PzKP{1dIV5);@QRyutKt9OKh_*A28GJ+l!Kwn=oFNXvY`{M zE0jw`ix^Z3->FJ@Bu^;pmU;w|UB_Hkk00?_@hTi^aah@aVLbBVd`l-9T z9fUAAkma5O(c=Xu?YG}2;%Q^-Tpm8;o}9;JRP`O&U^gR+F}w<|igCON+X{3XV$h!p z#Fz$jsJC<&vJ>S;_FPzwiQ&X|(Sea#H(*hl}H?h+sR8 z@>9yy5uD!DqNXR>`kV({3q$w5kr9~Q2MUXiMnV!b#vprZK58THR#RVQrjF-yZK_uT z#tE|k7$B$7Ulm&QM^jFHz}$QgQ8OjvJPCoM1-q`wEZT;YWkr2LV8_YFW4-q7U zS++;ex2(Ty4qMAB8m!Ygqw}nlU?@}|S|6*C#2<_(l}MUQI>JhNfsky8X&Ic$^CNIj zj+EXsty$|TZ$OE;H!43{CE-!H8741HR$p?)>>R)TRy8G=ixubbVg=zRXy^zOn;xB7asw72DDwWNHz#!iQF#~1 zS`*CLN5X4$u+hCQDo`0xx=8ysuS}dW=e1~;6dSn@ZSeRyN3`pv7)s&R0W3}}O z1BY}lw_84Y4XY@b);k)vY2^&ktwDEG*!;Ci@<(sU$$d=PPDZ{r&tD3I6gps8&DcJ) zoG$Sm@|A|Q=&Y+yxV2#B0#^ExK^Nx2fvpW}n?{>yV4udhKyZ-86lHfZv8 zI|l-yTRdi-KyEs z87vrNMUWf$vdVLhGsBJIMy|lCX8u4KZ3D8zEJ&mgU^oxG%;?2D@)*NrvJC9GV9;z~ z$7{7aO*bPV&#;c3Efac#s0_1NG+t*_z$ke85CbyZGb!p3Rnp#;;1<~;ZDhV^btiwOb`$yPzEB3!=dvos{2KEJ$s)h(V=$KssRhEHc_lOVv|j7y9&VKLHMRD zaY(UtG}mUXIW-y`Am^fL0@QETNZC*!NhOEtfpAWLbhG(r{B+A+P5@5{kfA9E(62Q_ z#9nf#oeMyG;&&Z88*2z0bP=^PXP>7(U2^*;Rlq*F&>MpyHfx8fvf*VY>>UZq_ykWr zZQ#NXyeWy-gSDtPl`EM6{Zh|g!KhXiVEaELkZ~_d(fka~RuCZ5*A53qXnDYw|xV-W3c%oj;2YldQgaubbY$!O1*e zQ^*rwMxDF12RGB1u{`+E9JXEQK>~7=%5iy5rm-)4+Cq6Sd6VQenUL3>(R7ZMvS}G! 
z$NtsbKDz6Cn=LhifiCA4l!tcO;AX&!0vK4yL(VKxaB%w=FBl1K{{RX$Kkz9KR-B!r zX%1s04?zKiAUk8j@Gg(K674v(`V^(nb#h{m_I~7uxX+ezS0hm~11mQf zY?Owtc->;y_IRI0{ARTHB`qhSi~}sU*y&O@W)XQBw$JK!cPvAr6JOr?E9pyuF@DdC zi;vSMP?w`@I0!cVPNTOi$=Cgn_{Xi0Le1Z$%l;IA=zjkrI=})&qY{<5Kb`&})lfNN zeU)`mib-Cd*PLqWZhCZAWgd%aWjS`;@)r4<#p)k!V?yOegF4m8f#Bo$@a|nYl9L{7a=!4W4%Pbg=j$U+ z;Jcr5>ZpY+tUqHq~?`0=4G;nhMW2a!9DxzS}>*j=~M7eu&;gRl^|bRC~Af z3-1j3hFd}rYQFTx^42!?V%P@-PoA4-zwk0$Lul=mKFzhrokqS8!{ZuXChUiBvsSi%j{ zDKoaWmUMC=SXPI`n zB$SKZq4D&sGyYQd-zGk+wY@A^oak80;kbE2@`U@d1E$9SmoQm*Q!O5XTOSlQd9~Z& zx2;CZwq-@ClBAD*nKxFtqkR7Zr?&Og`jwdyG_>Mp!O-SdH(3(gAr#p~cqVUJ=B^FMe zjgp6>FKQa9jk96{7zNz(r?N=Tyo+FYv6wPgY9|i>G>)7aZU*)hCV}>@n=cN)TO#?j z^mpoVS}22}n*o*q%QnXw_fj?Sz_Zhnqhc612tucKk6!QKOgGuDI)Y1gR9$D@JPQ9^ z$DV#m3^G0PdI?cS?z!5Z&JKp% zjbM7iW8>`_#Y>yA+uQll%DS#tCWU|U@-O70u-4))`buB$BchMd1sH&iH$4rMo9AL} z^LgVsG=j*vMSggS*K50FyA0B?WDwY&hFK|^M;wZ~R5n|bH12hKr?<;}isokfu@9g| zZFDq&hGXwad2$h(V^|?!^%=eJcz9{r;YS-BSzm6#V~B$G<7^(SVmd};2B-JDt>M3Y zdDI9kIXhVH+sAK?uXNO}WTulJ9b^v1v8n(lP^EwHS{Cy5TBYWtvN>z;wN{pC?IOR5 zJ-_T0DxT-#^McjXJV$=NYfQ(AMd(YB=;9-DQ|Wn5Tjpve;&QBU>A;GZ=ki2L1GZs_ z&=|j2syh_${A&zau?x{jriPV%JKgCqo*HyIMCs4aq8gIjoV6-Nr>B}a-Xu*PyxzX` z8ekEW;6Dz#Jp6~3AP5vGU=bz_G4}I7-7w~Dh^b<>8QF~Yv`Y;HY81C>J`vCDSlD)H zV7kh|x@$W+t#;JGF-FvTV`8R9)uAsv;HD1%l85kLIWFO$`Ng3X=k&!e0#$zHFW2+m z9B)TXSqnX5F|$b7Lt>J%nb{8J*M|5kwOYYbO^bAIkY%l4jJe&G<2Iv%G0?I>KR;Af@E1QANXN_!ip5oVOBdgb-Lx# z@?VrtIp8-u8TEKLp-c!<()zJs6-qN0&KPlx-U{H`>g`E8K2H|t7tOV&Ym28l-6!|z zr~$~)Qkda~agxJcTQg#Rd5E$`gl9A%G*d8`*UalZ2N$BAf0DT=z%8K=m&VL#fqAar ztGIEw_)#{kE<-XJ;E$;_QXt!whLo9IO0Op$E3)?cwo{0tu)ipn(04H5&JfhOm|8*N zxqxjc0jqCsXEuI$dK)o3m&Z*V5w%znA}<~6#)E| z`Yu40AsggS+$>O!?_G?O0Rv3!^$7|9tgD{1istUUpwa*^2!n*VSf5ffcO6&kS5O-IOAqfB*-C)li_#AwWn?DjGF^XfcdT;?H$v(V!ugsB!rVo5U^*U6M zwSLx@D$w-`@NYFJ6hM7)HL7;BHGZ*H6Tj3>DbpL1o)5atFc5VCtK_lr1)cilB8JI+ zi%dHUAPEbI`#k=*a(-TbjCsOgMfBIP|2IWFq!Z})K-HnsoBm4oPrm%GkN+L_zjFDb 
z6aV@6|6{#`DDmtJ(M9+0?7&b-JUD*CdxY+UGC{cN6U+Y3b%jnsO&vxp2-LdoOFih_oA^dm=a?%UP(o9<=(yh%uWd0BJoCG9{Eto3enfMcqV ze+uooDX4*NpX0BwXZQe6kco<)dlLf^cuVEGcSyX9j%t8NVp7jGb6ukfo z%PvjO!Ror$?;9fkYZf|L%&(3s2pp|B1o-V-)uTF!wF*Z*GwzW<&nHi)V^S7vugLcK z7dOUHTU?sYs`FU$MjDcXRgt2xnE+ts&vwSy=3zV zQ}um-svqtLuyHnDm^OF|N%%U1|bre_>-{X)DwVd+bm_A5?W*0?A)XOQ_y*WPvF(D z+y1Q2d2n!2+!u*QTvhF z18pOB_v3<3+lx8#82D}8ZW6BQ13vZC!WdLUlhl2I5F?t7re?SRGAA2Z_42*4?xCfv zi-sOmWP-QFQf-Pj0BQF9V9UDO5J*nTRW`eI5Tr%UQJtrAm55XQMeQ`ue`M(6^yvan zxC_J&ej``mSKxcs-x5+@fs_5*q5k9XU#1tZd;Ysl|1xO!2k&Lj36lzQK9Ds_m=KhW89Ai_ zSy2}_t+Qdcyu<^bDW~;`%+JjEgrLYc;UAoY4r^QRB}$KNU!I-F5L<(vi;Bbm(&}a; z?*Py48m%gay3rf16Ec9AQXb=$)j}tnzg}6&Mt=YropURJH;|!b1LtAL85*xX)AV}0sUOOxfW_62$ETS_YghY z(WZ;OOw0i7IJetSj$(}jV(tY)nK@A%EDxx$Pv^u20EAXN`t~ReOV@I&2zKV{EUVDd zI*_68FC8GFs4IPPL~X%QHQvE;AeJjG1E{w8bd8HkV6iPMmM$>U{=Hz+SYQE`^99|| z1)W*)U{JvhsCWqTzlZ35VQO04FkRYG@<)Ikf%ol zr33&xgco5+yqBy)A8b%nzs!?uF>RR?B(4qo0}X`4C;2DgazRA^3bTtG&Cp)-2v=N$ ztbNl~3>|1ldWl3Ew*=vP0Wg}XnFVuv(GalkCjADT!5Z2NXh3;6N3%HNL`{fvZ@U8j zthzg`dKpwqN2jwk0a-|Ff$;RA1SwL2dY^Rt&O9Q52@FUIh}>&w8_)w8vWjGFJ#N+7 z$sb(*^g`nYMSP}`MLy>1(X?w;Z4jo_l|sp4NtbLMz26ajhLjEO+$E2Gvvq+hFYmNd z4pi6duW0K#1uE6=0VY0m{`?H{G0QMbIqMF?=XuJyMZkya&V$eYmS~`GKs#-l4DVdH zbea(C4FmNIag;Mq;U80G(+tM1n>)GCkbsb?Q#=hD@e)7C8c0A7%XD_UHp;uEKx9)x zYwv4j3w5W{IFe3UsPoew%-1q14x&Qn9@w?xDydS!_Ua#{! z&SM|X<2YHGBOewGfA|fGZJ(f63p=wq{Y{k6F4y88H7XzM+F;-vSY|D>ic<~WP<#T1*DNoG^Y+$>4 zL4VQ5?f3OEEe9Q?fu5}TWWH|)W1zkmQZO1ii|nE=a3puVh`1qeQl;IaPTnRVCdf%5 z^FX8J4LeH%<@lqK`3wo$v(sw=d52nTBu0;Y6gy;hxof|0ddAH2<4FKHir=2G$R(hC zh3iYF18J?hv$Zrhg8ENTJzvdEzUeinx9M+*J? 
z_AmQW(A~z-yvyi$x1;QRyBEi6_@jl;u8wC{N;C;G4Vx@wKYsZA%CkqTRb=mFBh#3l z^NJEyUDQ~`!&_|O&%p!4oWHkggWjQL8W(;4n}?fnH#hR^z0CGUu9<1eFKYo58r4rizrQqxmpIVb}vdo?V49|(O_4|c|t1ZB1`ow zKMI?_z@4`g(d^@n;{1{A*{i;r^lZu>bAE8LtM-U7dt6&!N6srx8)vj7jrgSxjjan{ zO{VuhR*MIparR%9bB!>t&+F`Ui|`)*Df450QW)FZ6?Gfb9+uEaH?{A`n7yyK)?qW# z7-a7&mijlxzSBr?e)!hpYe0!e-TC9a&?leW(TDWDGJf52m*oxdFMVL!#og<7^XcaK zh|-VensSDuahS#*Lz&#o_Cb6}Vo`l=tKy%8x(G=a={Yo1Kd|Lx$kj9^4!G?`Wu6EUIj!nzUBGPTv9&wEj(W=wm zlMR3h`4%ES`OLyUhRgAryj)~(|vN2LV`S9=G}OrEEopYQgpl(<3#{@wDC>rSho zErP#w5eqGB%lFmyT)y0QC?0D_h7&oMEph+H)kVc3DJ5mIE9SXb1q`JRZ?U~Pq%%C? zq-easE%`&IjL`-s;)lsVQUNk=0P8F_g3$@p&y5)n^c~w`6Z8;x7&}_d2TUNTOO*D=9uzC zYCh>H`pA2&z@L7gGoPEjqR~@$=)~_=)od|6Y6*_#=TUZMr)!7?#XI`7ef#y*Z_VWuC&>~sM%*lo2RX6TA} zldX6t!8h>4c2}ND?DyK7>LWD8erXwsR-I9Y{k%7kxA|B5$@Wz6WA46N3OjXDzt7;g z{4hxsElI2BcCYhUM9ek(uBV@_2UNRP#_IkXrK<6q_@vuvz4iQbATwl93r_p(^4`le zv!=7--P3dprPw5f#Cr|v*&iF`E^h2zZg_9&s>YZyJ7>rhFCV1srctrJzO;4CGKc-_ zU*F%}YHG1y&X~@pl5UhvvviKuJx%Bd(jT!FaOt&DACI2{XzYC2kna82bzQ5Ex12Gn(Moa^`J9-Aw zt|Ep=3zZ*idZZB7nG|?VU}dS=uP@&0EH4mfjP7R2vr!s4_{5<)%Cy4h?#ufxvVCgI z_;f2n);-GETALNW<+>~LlL+@$%`A@m#a5WJAk**1p8Sik?f2Kq-c60xd8zlVY09ri zcM^HxAi795;b>s~B@ru)dv}s@6Ri5Z{2O^;pG7OZ61`Vb(kDW{o7+N9Y96(Q zgQL-f9)MoYA4SeOnYGm4_ZrM7qO0dgz|Q>|(MeLzo;6VZ;g+|(1U1q+zbw96Yq>~z zqQiil`>Efc03%gI96H9)iFS_Y7-QnCUtOShys#Pa>a?4lcc)=V{*soBJ;`P>ICWI! z7(G!IJ8Ag!nRfN_g8caj@%2Fk88>e zI-jX8l^_}<`sR7gjm54#Pp{NC^ZTh7*=6;0(@3rJgE928>f!czuD{c~$CB=g1nX~% z=eYH7ldMnv%}to_AZ=hCTymO&FU)JO`5DU`_FSnm{(~w{6xY?0oh}nSZt9AFdZ*iS zXBCZ}jsDRx&Ru(3x9+R0|F-?zr6)Yhv7Y;eEnb)S%0xQV*5xnnF1!5s=e?4$FK%%j zkA_Zq8(uGfa=X0Rw*G6{nC6hqX<9dbQj&F|bd)HSrWVz22LMp^-ugp##YJwL?cI{w z)0MD}6%}0FQIDvUnh%+yo6%1I2IC9r(wngLSzxf5s}_00c{)Sw)-+u%! 
zS$BU^RQj%WGI`=6z=)^Ym>xNi>behpD~oYzz0U2v^YjEQtK-=r8lsXB3}<$Xb~Yqj zbLYI2O8ntwnu&d~M8t;ik;dW|+2^V^zcFpyFP1EIE0r1_P?ECiWVts)G4EN}uG1ym zinI2N4>2Vj{GRk#0s}7!fvdHtTcmP3)kNxE_Cqxtsln?H%kQDnW%WXROpvg%y3?4* zb5XzS-c?J}7}rAd|9kWx=`?Wh1lRprV_Qu-Z60?>xSPgECx~X8u-BQFpJUrx^YO&| z)K#`RyE~O3)gQVwy$ZbEOu$bfGvGf44p`a|5@=Jq4NR9WGvZV?+mhgSRlug~&FVY* zb7Sms+Zz_T*6p{1>2clHZ18)#-N&li9xHV{){p-xQEa{}FtkhfN`A>lbJdl42ln@f zpQdhnAfCNSs3zvM$?=~Stq1pt*}I&G8*9M~lD@b0_XlXvN$mNO4zD%~xD2PL{Rr3h zNvNkC36%EV-I$}@uo^KwOh~gtsB$I4ZxMmPhMdoUeMKfrj zOk$5%xHUb~1XrzFjL{PRin@v5w4DSE#eWgd@GX`FXyJzo^Kb?ZCw&B`az58am$ z7>*@~=FCGV6Q7NLqYH;fL3q*P=~j7^n*77M}^br zy}Oih25*&!7z{jUm_VyZ?Indly|i8)n2vyz{aqJ#SiX*T_#PYh{XyQ0T&ea3d)kW~ zG8ZhYe=YGh;n5*pr{U1E@C!F`*X_Oa>bF&<)kvL|TXv^|n{@H!H!l;`+&C?>+4qU_ zP08;o$E33t8KjQxi2ZcoMqJJ@-iz2a4$q5AC_Qm@_G1}R&#~Y-((TV|`9p=H;<8Fv zdirnig9$61_7>@JWSZsjb{beM)xY{Q(i5rQUWO0EanRXT8={LgY$*YU!n<;4@7tSc zN%j34pgY_gXE~^NCR9*i^s%Z)d0&Qrt$EAB+Tu0k2ZjB2y*~SL+?vj{!o|M?SMObN@-6}6o^ z{n5nd9T&di>vzn=b;kJ5j?_g}jFVWDZJ`2U1ox~)53-jIEHQBd(!AzWI^ zHxEz8I)8X!NnBiH9_bI{JyBMzytC(81=K33I#jN>VSA8f==kJB+M7MZ`jhm@-~UBk zC0e;oyf**lv2S~vQ53ncPvPe>hx1|vIZ~tz=M(A$uUzCqHo2z)jC7flO>8#1lL9nK z+Zj(?R!oZ4G+jWZdZHlCcrI!{#?&D!)u^B=s47WEqBh^v#u=d7_p$95O=DR+Hdroi zTzKSevvphqLA~~7A<BcD_;@K8JVcs+(7E!QG852#(P9R&uK&9ieRX7hjv|@f8f%F zx2#9K0e#t7E+Frj74J;-1M)w1xmG*U*dKbfHl_>FegX|9H}W&fHfSuyJjTf|)%Vfe)%6@c-Z+ zrf6i0-D> zWe5D}e)z}ZXVuaKS|V&DTX(1~LDQ~BL{>MSMaKjtxdQXlDuEfW2|j}QPbfxW)V*rt zVU~WKUxpl~{Wh`DlF@w!0=J6ta8|uS=IV>y=Rf0N{z>)%dB9%D?s^TeRf%vou9Vl@ zqw#l?1tL57662f{25&!2qN5afRK)!#@;Wz)yhdA|vEQfZ9ys0RJP8~5D*^aPL=0x# z`s>Ejm&I>y@n%0;d^ChtOg{vr%9dGAQ$?wXr|SW(uxorj`*G@n79FG=_zl+5>sC3J z3ZHHI(VznCp^9a9ugU$i~t z*0o4gek;5CWQt9dYanU+Ah`^W^+a8efo~o<+{egBC8^3`5-NUfFZghO=WOx9{;g!z zSkot!pfw3!kKJXX(w>|;viP5s1i&HS8jD=}?k7P=4LD-jQ|js?(#6-*Ha<55+$Ywq z6+@9PE6NbBH&q}~-=GiKNa?|B@89E=Kn}%vjSiA##rXe?C957LUTtwTY((YOo`sX& zTuQATh0~>JU2&;i-ETiU#7T^-mhHemAK*kz->A-WJ`|laYY?^JwqxHb-_%YH2l?ki 
z6~UqI|D2yvA-GUJG5h+{yM(KMq7qs`+SS)Pz29B%@c7tBsAyVYO-@D423=LON7$Xq zXUmWE=1(50+BUmV{uzNE{nKMnb>oN1tXEI#+~~?&jH0T9yL8D}!Q)Lo(0(XoUGrFN zQ+IrH#mDOCzM5y37TKW4bM&ZpcU=N;!v~8}SvDMe%vzhQw|>3Z)0wS14x@@FQ!#?- zLLbizn>>%gng4_K_~$A=cs6?TMRNm?I&32Xp2veux|ZbZ2K_D+&w@@+WFDPT$F z^%Q>k?0Zu)BnHikF5Pv{XJTX@DN5MSnS`eOqyAUYh7XlkW|AA|4L$3Y{(}U?&@3U^ zI!4(aDZy*>4#w`gWv?*yL$fy3Xw&FR!P>-+ln};$-STcdpAr!ZQ{F#qME|5+gw(@8 z4r}8I|9}2?LihqQ@Vl~-HUE=?$r~c2U@mG3Lj?Zyb%2=Rch`OOHvfx+!Qr*Wfya2P zll^zU`!ftK^T@yQ34?D9;%w#b2krme@2-PV8Z4Gy|M4`Y)JF_VO<>oPwtx4#)i4t= zGb`Tys|8`CDx=v-q|z_%-~CPl-D%1TLY^$0ddWXL{5l}z+F^Xo|L%8p(54Y6xz>-= znE$6JtWt$;lalEAGn(h0|KPs|_}>HkpZxM)3H+}F{)5_?3d;YhLjS8m|HYL5|IG$! z&k2#Sxtr(y>)3k7?t_`8)%&ztF8t6bun4xayp<9p-KS%+aV+pI%ajooiiT6NAZ5M` z$D~~zUm2O+-Fu}YWQSkyFNcFnMUA$fR&+glbbO0dy-!bw(VywD489NXz0+Pt{&Znt zI=;~1pI#kde6NhJ`-&yBtw6MFeKa?@ja_d#M;0Iq-)N>b%Z;f}ubMK$L?(BOJJZEi zHfEpYjt1U~3%N9Pf>KuaYZjyXy$k=iOPt$GJ2;Bz^EYX{HHKTm`m}da1yq^BJ>uW1 zow4-K2U3Rk;u(Y7-UU;B78h{bJTVWYCih%=9ll*`;z|EM$ba}78hmhSB0_kkG>H+M zs&0|eo+$^&cLYz+F7EbAnZ()eM;s$vMn4oc`4^GgaHpi-mK|#R^Fh1>s51}~+oU!{ z=g7_Yq|?!Sa$1y7Kok~n;ochlDR-ze5+AULF8{Q`3%QXlU}$79*fdoSSprq$UkLh> zz33OpBhtuzN+MXi``v*wb7L^=nqmH%+b7(n$f*W^X@6G>1Mm_vURmh1Vz74w+sbg$ zotGBsnRy@vzm*4czG@OJ<=f30|B#mX9^3PP^jBZn{H>I~HuR65r$hshGc@Mf&X^*q z`T~%-Y(ub}m>;j0B=Q|G9(~!3y^=@KZ$x61?{EDi!jKDjrPk`+;WaVxoIrC0B`_v6d;ZFB-gXHo}A*0I{*s= zraCkSA4eeC5~QJz(C}pC>bUTOjd#tTr_QC#oxQGN8t!&7WK4+LF!n1tGL$1A~g z2T#epJ-4)X?C19nWK?y@#l`Y1St#!u!x(SNyw0G^mtQAyyn1thXwdApTMrvy$a%k@zY| zM-IOOJQDdo zlW13YM8=}Wy7n|!HYDp!N70epSig1}MP}NekB`TDcjWctNS;<^50>&7!BBZCw+=7& z^+yML3r`!nRPV^)sR!oBE=~~vw%c*}BgnfD&i~fc$dqf?-pJ60;JV&uqQ1>Q2}MIl z*T0HJq3vlzOdbtrM6}M?l&iM;q10(L$ViK6XHbiSLWb0s^ ztI3rUE1A_(+a|M-nX~HyzeJv#CD}a~7xguD0(Jw4@rSPcHp%rggGz-jzh$SgK6@s! 
zK39KBO(yf=^FgHgBp=S{e1XVR2BP=%$=Nbw5WP_;V8Zs}KjIWS#nf+WBR$}+zbE3z z+w+!$H@?i-;5Fhh(YS-uOdWsE@Vm>owbHwf0ykt+@re3|!4g+PrKx%~Oh><3#@Nlb zu59CIfrUmK!z950e%3S7Ba^2p0iiTYTQN*f9l5VO%SkH8O@Z?TP5U6ivQ0Jf?XW1ni`U0o7fy#XtRTTzB7uZUpD9 z&_gZar$3*^B=UdDV-^#?d!>{JBI5UWa!d&d><5}h5b8$ z*iQ6>eyD>Ah;GPp{^cq`P4-^5yaq#?t)l%aP+!+^nL7}M3dN0mieq(I zHjOXFZID^3Zd}Lo$MDt9L((`_p=UZ*AtDjbTtWwYnly(;aHtCW7k*i zt~?8I$Aa9K!tWKpYFIR&kzu36HRdUI%@tZd@!bpkX?KmzX1*eQ1ZvMNxHu+AKQWmX z*Jenj=x0lU=U4SO>CKaZl3D|~Jx0o|r4A?Y$OX69%u4*ML51a_*UOggORq@BuAfSfcIU#z zjHN2(OzX*2+CrGflREzhT3uSa_<`G4u;FmzPe6k!(vA;^Zb|`Mju@9Nw7;xtFq!<}EO<1;+$+djsxWWK3Wioc{3%ajBM zrve?^qQsN5Y_-eH-irDCm#dEe))X&(BK6)UlV=69^*ihERnw+k2c8CoG5i-wYOYJnTlBl_VLbch&3AV#wL+-^~SrA&K4h zF257JzL)c6-3nf@(EQs8dSM#!r_X2Y{~5ejRApy*-QzF|6~CFkEyfd)_k_x2Rw|0F zTev&#{I=kq=IeHxyV5XMva(nBhFVz}>&*`fpQ)ZY`d%Tp=lF-4vIdSXevaKcUYz!& z-C@+r{fitJ$47-~rJHw0{S;}1nK=8bt3K77Nm*G^DJI3n4P!|`;zo)70I*pwvu{Py&WFb&)` za98f_yLSWV0Be3$U0ppmmN@?F`##lB$sM>vZ*Tx(r$55ozHsi`y^E)QX-+aV7oCxx zcAK|rU`hlTGpQHtb;rKHR8-!#ukj400DnSXTb}0g=g&8Ke7lqH*4HgCiq7xAkdTj? zK1?}?54ZW8(LPdlWzDn=3Yu8f)%c_&O=IJm(2LRHM)^%car|ITiDER?SvRF02$Qx6 z8hfnN!7yWLH8K>bmra|TtlcHUXSDq?Oy^9J?^HGeXHvSj#PdHv27(+_P*_^{ z4Q28VLRKg}D$BaSHG7Izxbzv#_-UP;o#_6{J@LZr=eN#o6ql;f9&tlezqse$*#Q|lIR2>s9Z4`*V8NHi& zYMTTWKPJ6w0+LUjEW||?U&-wC+AK=YH0Z)JRv?ley8T;g>q2|nsJHg z>FRS=s?|O97ZC7wo}%7ozQ_ZeLrXRMB%6jSq!OSe z+{}7VUaCU?`^8Xw?4M=9POxBE!QM zc6N1%NK4l{il3bNwmA8AB471%754#QjC@qhTwj+k;!v#7b$BUn#j<7b_wMN$gRxuxlqR~~cqpJ3n=mCy=Z zIK{ob-pt@$c}(Xg1vP1_DxtpS%v~5R|4a58$0ZrtCNhYtSqBxVJ8*za)|LrVSPj>h zN+(GwZjN@&9^jjOy3?bel=XhtxS&+Z!;Ji8ojllqae|b`^bS6dSM;?P=wQ%p@FMGD z#|qIaV$*R-e#MHNXmdQ9n3$MlS;=3JV4=#7$d#R){l%fD02Gv@mGl_c{IGZ;SN!hH z22-z^?w@rZ5AXF4=juO&ZS1=qx@>A2jJI%3qM~ISit?R)`JpF?R@ldS@sj1XmYK7I zxvZA;fBdF-U`)DX=X2x3-}ZD#^MyNtV)k=z-m!FU(P`E%^!bYC?C4RTH8e7nmz9N? 
z+j_OoafjMqEfuDw)jXU7z$Yyqe-q8QsnAxL3Q{Gg@ z*{tbbzbjqY!|l6vu4y}a=83D*kdDJ_j5<=uiAhP?y1MZgpVRZD__y0$>u>L$U$}6= zDg2QT#hB0Z-G>2phM!Bpkqu%)^%u`gQOnnqdeK1_c$K|arM=^&hHFgk-oDk}eCs^x zk^MgYxA>COUa;OX@Nq3~A6V)+<#7qAGe!&03f%f?urGm5b9v_UZD(isry3p6(9no# zYHH$;aD>Cd$Cm!Jfc2Cnm20gro-jOzTWE^9Rf&|HvdK4(k62qAI#f>AuxrWMga0rL zBuSsghi_YnAI0`pp{}db&XRV}^>eCV^|TQA5;0KTy<6SU@ovx>y*X8l6s8@f3KZkH z*k<;HYZ8>te5UPA-ki^;R09|5_L7fhw4IzXKoTu=^+;G_>Qz2xFxD1ql4;g9Rw2_n z3_U?tEQD80Z_=DB5g39vBJ}We%PJ~nE}%$U^l79v4SS@#eedTp%B-8)X0lCkI&7tI zugyQ8O{2x4@w@rOq2J?U{8CoxLV4?8#($ABWO`%)a?n7IrW&p(k4%Vb4k4(vwzmF# z&up*RqN1bg%$~+WuXH!0+<<>myQUQ0>TRek%V+n%tsj`1hQF%et+{OJ_IW7>AO9ID z1UpGxY3lN*8}OM%Km@tf&(N-kii?X!79WuOKj~8sS0FGVV$p5|k=3jB+uGVHc|S6z z7Op6H`qGAHr3tl7q9p&{s6?eX2gJe1hG69Kln~fzOE7Zi{uxozL=g{$-6>yQS$xuG z3yyWB2+M60hW)6zx%!VEFPo;fE!D6(Juxk<5t=j^sECBZLc@kyrs(MCdZ1`H1!VSq z`t%7#(6njZ>Fr#s*$e{3>KxxkAcQSokrc7Ao@u*g$uYkpeFU*BFa^taczEQLdFRcu zdVG%ilJorS48Gs7iX$r_v#nS|8*Sa&=~)*i=92T%(TT$?M>?lZpLPpEKC2FSfM{s! z9;CZQ5Lslq_RwxIsEP_Q&tToap8nPO{Q)jU>dbkj?UL!MPMwFP1B=+3Ui z9*@<$pIqQ6D=n=8Y^eHNbVi1TwY7CifqNRu%=wWuH8nfmg@Y)Xj55bnRqw$HS#p4i zMuJh3sb@y2z5#h00`qKoy$JemG(F(}k98e?`#&>+lWmb*sqE!b!8@qjZ??HE?0G`dV}MZF2vt1-USb1w=Z>q0GKL#0zfoCFcd( zOL~Y1j?WVErnarb$rpujPfoYRtdfg9qWrW5`Q+rII&x!DQprxv&VgZJ-BN+;sGB9g zFy9>*8MqEd;MG0KJ4>cttxjHzyNi;uZ5q4LC|!q}Ex*WC<8G!N04}nM=N$Q1B|O^W zdwk`3cqhk}2MmFj_tfCH{p(|ZQI8_?p4i0oGPbWj^r8-6j^NZoX>e%4h3-_x9Qzz4>k#3zQD8>APnFh)dd5fUY?gxFx34B3C$!PSTkl0H`EJ zMJ)*o4o-@S+PZ)L{%(x)!C1>QG=R#8|(a6T5ss^?gz94gir z9~*v}tE(%fyoHlXSH5rX{1M-hZ4-xNP=mWae589>kRKI+bxe%ZiaFriq{Z8(lg+2} ziRWl%mtyQa=6WAkK&ZYtXrW^((AKU_qXogYjRl1Aaval$`39vDJt)!4+0mfHO;Ex^ zT6oDcC9Y3U;zsIa0a>+%M#dDup4GmhwB&w|y*D@O0X9DnVX%*#1iD|su_!BJXp!&Y zhLA^Raz-hfSEgn9HWx&b`1w$cO&&vitvbl!*Hqo!<~0kRUCUjwv&aZ9?>T~G@CYu( ztIx4B=vX)l$dv?aG}n=?X6o$f>1U>h5p9VCHS(l4q>!H^UYMk0M)NNfW#wMW zKxti~%I)(Lnmr8vZ@m#15FpfJbDWk*9JzIxJ?DL)zH)7w4|@AoIZB>nJpvth>sbfG zoy$HPSc##o)Yj>Y%87~HUB7+@1HWx>*^#oEn%iDy00+y;%WH9oN)&qfN{DlK#0U>* 
z>|lI<*=Lk!FP-&E{yv0QpKM$bTN$xQ2g(w&LLxdg?+f2>Kv6h7Hg+dk0a7sJAVD_+h_9VgCuF^y^l=W12CG%x)Oe znPGPDpv$je$Ks)U`j4~WLaz@%ycUZqBrJWc0VgAv_fq<_lkq!__|7dY`bf`d3m)tD zmAH=yPe#~kBb=4grMoUQ4W-cMuMrWsIv~CEIM;u`QIC6idIr#dVe|FvljDn$_tEa- zUIcPW(AbLuF#F>?rtgN*n0K)Xy`F{DxYwOuHNBdKIctj!)CXAU>4hq*sJQmO-ZEMY zuXdG!!mA?p`p4(z@+?}^zBaIfRS2$?%<*7`gYy}w7eOV-)MeDa+Fh$enlmXtr2j4J zk-g+x+12t{Px&(R(B{v(GH==mWS|<>r-O9Yy*l9BiVXBLz8rT69C)6Dn35VGsmZP6 z13eA7{damg)k+sCpa)4X%>xH+OSD3TsNPaAg0pm2fM(N>ynpM~6Anaf`iagh=3EL1 zjD-Y-x(ZJYtU(l{MR_Wl?BwJm1sLRtp9Y*s7!_clP8IS(E{ zQ$+>YXxfVHe>bW@*a2yk-n8-hy1ID4b3r!EJ$uf7`}WOTIyqJ?A`z2b_LK)p;f*74c_#JA$Y7t1Y!v>x9vYfJ zrekp?UA#E6;Mi0&{|n9X@c=|FmJt24^z|BSw)yOS_)+`dM-_%~aZl?{W%5u`s8LkS z_V{k;53IMFRq6o?&L7?NDuIE47{Yr4wZH|+>x~5%8_A}<*!D0mzJwd)k@|kwRF-Zr z9|Xxlo-G~P_sNqd{LoKEBHo;$J;!M@Gn=uI0Z&v@HXGAwcKx5ak3USkry)ht(=)%X z%J5NQP7V##gB-;J*$y2FfNb0SM`S`awH}!Cd2{EQb=bBraKp4T zFyG5pte_8c*S8dT71;d%6u28SpBclK=C7m+$CblR@+y^b6PDteR^Fbe0ytLyVyz@x z>c&`=VBBzxnsU_0wYRC-n%|67sNb7-SpxrymsNDYBF6oFDR<>fc?0BFDlJEWthL5) z8+u2PSLQpSZJ*-%{6`NXH0mP0kjc^|f9kyuZ+*=Y9@xoKXxX=sO-p78st@n3jy{V# z!@LCxn%AC&>{d}AyJAWQkqDc=9Oh9=j+JHFJm!(eiqEF5v9Sk;>dY?=u3zVWfUYos zAbSh!gaX2+*KVC$F)=j$piOI=pM$hy#a5f`Jt2Qd?OU41k zhaWMbK2(Ms(+#|4PU6#^pp|8NuTDM4XJ1K#x#BSf28N+M+qY;9p9ysRl4wyL)XaEG z&iU!MRDD^X=m08C!&j{~tP&AHK&A0$buo_gG7eX9ts#`X8H~q8F;3rUGbTKt_Yb{X z%n5*L@GxYxF2z9B2-C_Fkw>|qvtYr3mO_tA5m8Z|xpSj2H%AZhSycQ6lnjIaYD0g> z17RXjUhfFkqA6j5TMaJ;M-m;Q65L|)Gc!!%?%lg^B7)D%7PQ}_yN@%gh~>!04{K61 zDzcPLO}=sWJA8<71Sdap4-uaG3v~i2gRe`ExW^ zsTw@Ko=DsMSXZOdznYb_X2?O()NF)?0Jjpx8Pt@`NPF}lTg#TC_^W?gPe-wL`hAown}NaaL#qP0 z$5gE8nIfx_P*_Rsi{V2*e8aztRqoR%%R6;$hVQRACH9>~b2gap1pf0DhBNk2WEzy@1=*1)TX}zr-<8Bs;0$e^y z(u^#}>-RP>F|yvd!Qdv^K62ZYJW_}OPmgS-YAi}`LDR>?H3}$twUD_)Uri2OFm`kHk7-40uuyaa4rng|+gySN*oh_- z;+K*8j_>}I3)qJje@oE0b3*ARn`!R1WFzyFlN3Wo4I!C^^DU3A1p=6plpYjV^aZov z-P&Cn^)M)>7k40LD|n4qT)s0N=!pNKj$0TZsaZ%R;xVpUOj$y<`C~xfSmGUyH zFX=y42$ynJ{k8jva7jtphWwoxtnW|bpd`Y!&zh!Xz(u8QCYstz`R-iIhne9kg5G<5 
zb1wiCI2H^17~-=yitZWTU!GDLt3^z+O&?Hdmkj7%H@3!!&N}kxtFpX)f(u$_n=zx8 z*Y>-_Z|eNvPLpT5S$zw+!E36~rC^UqUdvw*5av@4q)Xz7{YJ9#&DEC`$D2HdKHFgk zwX)mRr%oiZ@Ak^$gSP%6p7ea&5o3#i3bVfL4O#nO*2a&KrlX*H2JT~TMgH4vY;-6W z9T0KLH^xg{@~`7EeI3lNUBv~?7siQhRsRRV$_bMfbMIQnot)%?n*&|+gAn>|lH~s>WPQF|kKq)@#RX+(jjRLILg|RH4oMpMME;h?2mrn_&I>*9S$k zEvaRyi09pLR#EAYGEmOEK&}m_-TD>`ag)g5AezqH%9292k_mmCW(Bot7udalyv8arB#o_=pTsanumQ`?E@e ztfsjcaqIPGPBSaNa6rCK;M|?-y?%tWLYhrN*L7Zc;w7RUvx{wjx3Dl4iT&(vJGXmR zCfxH0Tzt6=lXNHx_0#CU7sPLM#?_Y1s7>+rHuh#&1nI?!%--~EhmWsG(U0>|T&pPH zQ0}aY7pK*wDYtJOOS0iya>=zRXCj)DDoBHU3n>r2wsT`oOh(2L%$UTSc_wRk7^UzJ zYUQ1AmWp-|)CZJq43I4#{dPQZ?TF=b;K!?Zx?EAlRBI=@qI9 zEkT=MjmIwbq8J6!K+4Vp>xX0})ynY0Tl=lvqr+xSvRo%WA7A=6LAJWyH1<1QDrbCqtw`(8<3JT`p*k?R>PbzQrzv^W3{ zmkZTnf4((Nb`8Dj8gH!oWkz~V;Npmxad22#0E?QR>>snEZO_k@Fj2oZ?%c}y-XCh! zfq%#1zr+GCj%M_IPh~Y3;ftYo?M;Kj4$6iA44w?tLK!CuT{E;-BksgLWhOj4OqEQ485 zI~)#g;>Uat^z!YvgHz8Gs#;2A)g53R?9yq$Q?v8*XsKQPGA*U3iG40=0mQ==S7|06 zJG8WetjwTz_YB|7WM%u4O8>6xE!%q$(*ca!Y@T-y!=g6Dy4*!>O%M9sW-{QxJW9{cvHFG+Q?&Q?xqmQ46CMP_+5uDMGjREf z>*}XuXeoLtb0PqicwlCKl7+Kd()-G6Hy4*Cz|rV%D1uV%KIMO4g4=#CT)Y#IvA$pQ zP760bC3FRjCu_uN^B>m-x;8#kDUpdrD)B}hoq>ZuYqcQLBe*(zvCSqx5>GyP6u6~F zk~{sv_9tqrTC8jPlOj!-C|sqipyJMXJJt&m7iz*6xyE`&!BiUX9@8!Tr~$U1AY;I7 zkeT>Ge#mgsr#sg~Mzh3g$q+A8YYZ&}|1-C&rZ*uar$t29*&6dm2ICaRo@4r=SgpP` zgiOhT78n^0A8qiR;P0j&-Gk>X*GFw;eSZmxyDed!uTa2ID2BGA`HX)TxssAFOE)2g zBwy_AKH=R`MMtEUi}V-u!RnPy$Xsc_&eCBb@{Nwc6t%-vH*yNi+QvXhSnAWR9HDH* zHBCCJ)xLEzl9!G7NfcQj-;#w9>nGl%knd5`G#mzeKS(U50F)cINuIOi%*wbxaxk^{!+du0>HMR)7 zgn{&E5YN~$Z6N8C1U6j)@*M=!Q9HM)H9E2$UZr5~0!-Q@ez>*`&42#7hwxkaBA%?E z3URG~7TA003mnH5xO6qC!YwnVMJr52NSZN;)FUyfv@YER#j6|)>612`#jJ#vgY)$Y zN?Cl55zg04+s!=v#kUaf3rimzXMG<6!Y>r%s}>5-#7@ISc<=tTm@|oMp@0c+M**WW z3ky9qgzCTEDS0_@C*JG$-U*S#cZCBE;CnMgFUNx3fApR2M%FtkOtp8l$}xjiiSM%U zfg?ZEuo=i9f9B?ezejtSOUSf-X;Z>Hio6QU*UN8lZgB41@Whr-N0#YRE0rXt#=Az> z^xDdOIp^oDY(ZV((8w$8e6*3;0`h8P@n$}ieIFk2rhtJ#C_oNRs7ScT@^?*VSmAYN zAAYlS^r>vo0cc_OgTfvvrU$jm*Q~{PG1C;Hwh2XJfeW-3R-T!5nqBAYb}i|KSY#4k 
zY)K3J0(*Teuk*`;;(Iw93<+u#mn@%g|GtRA)elMgZko=<$ZC$5LUb-Y>dIdOU_BAA zprx@dxcFC$*H=H2ERxEkp%FNiO>GO;DoLzjM}tDfHlZ6>MRT1%%HLHCkX6tDN|6D$ zvj8vbMQ3S>&dwL`?ziHYPy6)qAl4veu3LLM(_i{(l30T@BR4^z9U#?aVtZ71!XFB$ zn-OEV`MEdD*CFI>N7#!&a8EF(<`T?vgj4y6g8fHge zb`$#?5Q*~ryUzr&Pg0Vx^ApQShgNSM{`&4@iT0OQ_iwQK;I-SyYvWzb&nV@P*QO}y z&z^oQed4v(S8JZw(KwK!6hn~Ia6tLi6yZJ^zZs(sf8!E$I*GH6$aY|taQ75vv)q7e ztbm6ld3yyvh(9T*VLOHD8Aau1#XqCi?3omddM9=dVf77I0ehSi^^8&%Rv?+Sl{ytx zGEk(s$$2*xk7pwY@RDZ30rNE@e$=g%!-}hM!lGeMukcZtiOnpUb!Cd@8CmPL;(bZ_ zRAC8djc_pi`{Yur=FUlwXpv_m&tDQ5!E|N&New6o?ZDZ^>|P)RIoZ9Pp&*cTZs=d> zvnNs%lO0EZEQVt6g-ekoWEG-(wV+d+@`N}r2LmkP&4-7 zNnVSVFiwk4{AndRg+^Kjm=mwt7yYzsg(14QqtRynau4DEYprnTJ%3d1vIEG%w+&Z< z$Mhj!bwg3H6$fi`@GJbK7J#kw{}Td1S?Tku8bid_;SBiSJ!{6%jp7hmUWW7k<=TC7 zpcStDND!dNZ-+?F_KnlWSL`9v`PGg~rngnPjOZs+rVjyry7LhiM(G;$0N(qkQ=NhO zZ7;rgGuzLW=;L0R!oUTs=@SsTMCdUd)#K=|nqlG~un+;))#UfSN(bN*i6(b*FQkMn z$N46ATWy$rTf83wV^V&);~Z*+DjuCPu&YNH%-oc+guW_`6WzH7{jA_hMUsN8zx*SuiH8kH)M zH1?h1tUnkCF+7#r5>NfxZLq7n{p(PG2buYLt9b>nJzN@zHuvcp8L5mLQ47)_;o7`Ar=Y{ zi}JIJuTUsEmiV-bT+O+{L-{EL0#1C!^-+$3m0gmT-WK_c^|jK#d`}Umw(eyDO6=>m`W2ED zB(ia(Ct5yS%=-Q&!z``O+82Ge5S{kYK5Jc2;wmNDaN;x6*zt$-b*2~2D)Fy7 z$TbJgy{4owlPaXHO9^GuB~zXkA0IvdF&(!=0XBJ$CE9kd%LnR$y-^0eK zw=M7jDQsAZZ;(X!yOoJjWsW4VqEe8<@$WYt>=jiM8bF}dj2lnriX+vRZC`}T4Q8i# zi5;o?@VuQ4dFcWW(ATNMs$mt_4dr&HczkxPpHv zv0bw^_2L2)++rvWbZC42br+-z1~EZ}#YWHW{y>9KSEms5f*Qk&#E20jlC?Ydi~FZn;7I2L;S%gT?68&hIWzUd|Gv#Jn7{6mp(+LJDe9?kHPWfu( ziBEeQmhjpV?==1_2`TiIyZ|&L$q$x{J_P+a@D^G=C=AJIM$(Z0Cq6(c2KS$MhqOp0 zK^Q;sbrw=z1+F=*A^UKv5Fb{Vr@nNnQlvV5n)r-?A|{0M2z>8_P}CNady=UsZ|5KD zG3dx3*)h+7Zl2)NVB->Ra7i@}m@pYkY@uqF#YcgtSI-~@QEI2q0Hf&nAv8_3B*fjM z!@anL!IRO~9-EBR1fMY5^rg@_ZtR#Z1NAMQ@+9*OE0l&Bj?|_5G!5JPiq(9z-2j!U zB*f;smzsQRi^BM@*-@Z5Z84cS14F%^@Z#Rp*W{*khojhLABV~C*25KwXYC-M>p{I2 zn}-pI-nj*_7~2?i8b?Y}aeqD~B8(7a3_Us7WKejB-oQYbw>&N9(xq7j0QH}IM|vjg zvZ{C9^^n7M`h%24;9-L4X8xK;3;9OPEwSghKSKzTTeWIJ9OSy@*KX}5kvybRn%01Y zeaQhVjOAGzGKdIWE>@OfJ3c-0>EPO1^_xCX?xEH6Ify7(6Oxxi{yIJ&I?mKi8 z3Li{a$dysl#OIEU 
z?#~bDaA#fiLBfyjd=IryvjdDT?p4K)10^x$=LJ^~6ZJ?WbS;0p5d@d*_&A4O-^eE`qZFOBqvm23qz;~78MIME6ua= zG&tW5W|+=oo(KXssrvY6r`G!&<~j#L@>HI_k7s!jSM#rx% zSu#GD+AE&-e$MEF*~d+9jkbNTMSlcQnXQ-{U-K}~O-ZO8+to-}%#UDrJM_d3r2~B6 zSUR4om2~zgcD_I;V8n;u#OzQ@$RX_jbJduWi@ryEX~KmMT)l!Yhd{PJArY9`PvUNA zL5EMc6G}quIO0zYvr}+>iJ;J*T8@vB7~z$gibUWhn=oID5{mS+3X!PQ=VE;7eP=AT zYPHSuC!n+kU(prPimBQ``tMgtP}%q3zys*JtXR#%aNxDw(Oa>n~Li)**44 zG_I-Igo_kXP~jgutN7~ygkjt0r+waZq#Z4}(rcFb>OWHMzzpR&hH`S-z{i{tN$3oZMynH1U~IP$-$`Qg_mb5P#P4qlSIMaT@)m5f~*2D3Wf%})(F%besOZTO=J)3lISS{ik{ z*I$c)$*C#&5hKCe8%3dQjC{uEP|zn+0I2$j9Zj_?VS`FSs(4n-9nKa|c$qNrF=qpf|&>9bvr;#)Y+F${JIVBV*Fss8sP93(7f}^SW-iAzUQ` z{bILGbq5}D4G#&UUOfno&4Ja(=9K2a$;(Ydk$~0NBIoA0wm{iN7s$W@I;ZHTPPEe| zP_tfP7N~(mpte(@_~R2w&O~m{E{%cQ_90qu2fMBdewM zB21DaC7s;~qzk(MF|!?-XfJPmq4u9zvDvzEz9gd3e?|;`Y^aw0OFHUFoF}yO4@V$# zx@@HnQ(!ocoG^p03&bvF+w3d~+(I?z)`?FcjohVsOGFaC_Cj{L;iYN|%Dw0hnWl?5gth;JrktE{u<&FU}JV zFoJ66?YGe0>C&o| zK}cjZ3$N`ud}khB;DtZ$GgDQCm^l+Mdvrwut9d(-4>S2a5^RTp&tRSAzGgCTZP!A_ zQ(}UBAjZc&)IzeUK+a!2_vjKQGQ-8cIR^qiI(7Wxtk;@2%?9S zh-TtoCd{+$c7_>rZjKzh*_O~@+rF$h6Q34jpJO&4-hTH)s-4^~MpEr4Q>?pdJDKfS z{$k%w)_GT8ez>zQRdAKcfbITA*v{Z;Lq~l^GL}hh@$ru->+iJ93*0dXCC}eLR=U=^jPsU`M6_1H8RvxL3Ij`I~06 zM>zoXS46VCGVLz4;(~X36W3g_2*!HdTz8cVt83xx{s-?eP(BfF8juV-3$)y1n~Z-8 zCN}`ZoYZC4cSJg&W!=j0eG*`{e79XwDks+T$Tc zZq(p9qOpD;hYu9rg_GD!=Lr=GaKKL!pE3A0%q6Kp0HbsL#Pdrr1@WQJV}M!& zldJFKGKk~8xhpO|tXd_@*Ua@;)3Zr>TN!iTP0#KGp^y*dKOi$L%AzUrHj5mo*|V}) z;`mp;*Xc*IoY3}43IjjmLgp9biWtM5xZwskw~a4?k;C^bi!;8$PVpf1piH+boRptP zq5S!b%Sw>wq%5^srhiuW{8cQyFTD2mXwQXb@{7f-&IM#lZ=_(+9`{nV*9Nk z86FVao|KJ&Cr!!(qtRHrA z#~5oJy^rvR>YBLq#)USF>8km6OE|6zP5C%Dh?NgseJw+;<(E&>u{@YQ| z`^sV+LZKv71*%O*OEEDb!32gptxmKGRq4iTuVBK1B`G+Eu%Zu6(GpXFH;y^Pefku! 
zGX$OfJmc(ihXB=J*jsVN8^A>AKK=R;=>rbuPbb}BLy-PN(BKLKpz)7Sp0RU@j?q>)F}S-CO?cr*iADKvCYq?YhD7J%dL* z5=PP@`AETHBlx14C)ymy#SuQH5=$W$Ki)v3n_glsca9*$Tp`iGslRHJUkC0=Fz084 z2nJTk{gk|M210JDYwGK^A&yI8V`Fm%_)0(0x$P5w6|^w3K|D+ahy92KfFtyh9%eU| z`gsAV>qa^j1Q!~n`L=M!xIUuyr7uF?=f}i?OP)^40`W2~xOSiqJvjr1M-FW0zn;2Y zfg1=@0FPSS9k3$W&&MIqP>%FFxTk7%sR${$hp5fD_;WWeKeq-)OXNKDS;_tPRA(Xe z-WYT#&QlyZ)8<@eT)haH$)H`zpokTX0RGu6+^;}KTXF z6AhEH=}6mSmkU9pa)A!$Ww`(Y5|dGv1oDoY@%c|2U?lznaq?UajKsmSLxJn`%4!X> z5FRTt#s6R`Z=zT2huZkA8T!Vz-SW>rIw*UX_; zR7Oblw|_X&lR*BuEhK0&V(kDRm6MR#OgvJO17#ky3m^jQZM*;PlL0QNho>@nQVgt+ z$kp{1$Miks^b{EDpe|dlFV_opj~#^CsETLo11=W+TdF@c$N;oHP6(`|=7%?DEnz~= z8P}TvQAik%GWpKc`>1yKbnxsD@t`{B%}@l+^gA$i1efJRMi&7*V+cG(gO*tfgDN&q zL;&M0+7_AZKpatjt6i}UF(v<=_7C(yj0*A$Ovx*_H}{D;7}BuX70Lfln6BSya~yQ^ z-x*SOGyN;+|MHlf3$ z0vf6?{@F1mQ@9rXiQf<-@taN&k;|N>$WH3w?LsA!!>5C1u&215w+3MQ3{HVdnGbfg z_hNZ8;F?Mwf*8b&`5{mN359vxxqdAXaP_IdvOaxnnRXu%7?Rf$?=F)-u<8)}oizID zQSfus;l+QR2B6v?=+OJ@*-=hNrbu2Nz@4N=C0ijs+%36snR5p|9Xtc^sDeHmOECjBkAWI>j)OwYHoC~7w*l30=@Fh!;tBbM^ zAKWi`^aMD?T}TFy$G-Cz5~PHm`16#LP6H-;i8TzxS%K^l$S?+@9KbdDk5BOatMCuw zQQ9=fo5%#z29t$AD)cZyu4n`%nf3%!_OGYMAR*G4(q|<~%)$j@{jUN3%djD<9)Llz z>QCVsoAcOE$%6b2g2x@&PFb^JWcR;SkK`R)qbSP7RuG~$& z_map* zXK)*R0SoYIQe%N``UvQnOi&^kl#vwvDN8_eHjFSFG-Qq3T>zHe8IkKjxC?ZJ3~EAG zK=ZAUi+l)=6&VaB8YcGVDMZ#l+v2FaPd>MW*H-DCeJ{@$1A1AdBY0V!6Nh|4o*{8L zNS@I=&n6_aMtqYCp`8HkN|uIcEs{H(0JUTVVGN30_6BXuI|6_<@zAd~$i_QSQjJbEOSw15*$?fFxo- zFOh7fMMT;O@;gX`?_sopHV6PrSxDG_1$$tv;uWn96AvQR?_kVyR!Fc?+ezy<3ttJg?$;tjVnIw>^mexctDh#w`QaSc9VPFslo$>0|#}t7l5J(mB46tA1 z;6WWMP-h9u$?Jt&8C=}Uz>xK}KNAJtgtifInEd;5OU{>>UbQGM|k(hm4gcBhQ zuo1)~lXoZw_Mi>;&IbwtG4p9;U3pLV1ftHv2>z(pCFFNi;y)1v-a^KLs2!L#5J~QV zHm1D*S=nH&G{VmwhiM-_HxHRiEmhdKhP|EVKmrIX?T_lWAsg#jTr}di0tO(LEjk;_ zjTH|FPSjcp!t0Pc2HOZo2;oaG*7qI+hftg@Fw|IWW}1hn@33l=qa=r5HN=!*puvZv z*a!{aomhqZuNLGCumyAbx0Sre??iz5g&3x`!Rp6B+!!5o!HNou0j!l;ayH^3vcOMv zU)Q1oQScyZ6_^1raIhw@R=Q>6q=EN9RsUp1(xTv9iC(WU@UBbnEyU~~S-;CL0K^=@ zp$y3Zt!X7u%YHe>`j($E8)n@Cep-D2`|tT18xFH 
zrNMR5^$8cewF3eChz@5n63QSk@Mos)awOF8uiboBtixwhO|WPCEGdk}zo?QMmQhKr ziD6-u*^>Mai?Gn-RRPP!Ia80POq zB~9vFY-$ySuly$j0JVd}){zh&$_6Hb#hkyLR5-PLz9EaY;$D5aeQo^WlZ#EV1S>A3#gl?c=7diqnqs%WrU{(}>tJrhf1GdzNg4Oi5ah0|YZ{UUo`aFR zBnRkAU@2^XbdxKk0NPZRh8<#!qgx0@Q6W=$O9=)o1Z(i z&Vk<1f%qs>teWlzAlZ>?!2#G&!TyJ+YS6f@d=5i!k0286@F#$DD9{k(mGj`3=gH?s zK~KyT=!;ra9w|P%jO>rgZR8Yz#>k#QCz8Vvvw8^GtA`Q=PIh*~UTtN8#qO4g*hWAm z?t)T+&M>zbMG3deoFXf zE+M%RB>16MCK3tkodgL=o!}=SjD;*dMwf{o_nMm#i_u-!06`vynjWD@VY(t&VsYTR zq&9+LCa64VV0gf0wVIs ztcXZZ#Cg>-d?y~Fp2GQ|l|~(hM7kl^g}(pf8WN(q1vdGqLL&kBfk$mnL7cZT)Esff zA4a>?%}+1Sxdfj)ap#b8B_9ruoZkEV9EAL^*D<=31i1|E>T$&XD|3U%P-)1Ct$kX| z*=Y##>|059h0|dR-ri=3MnEKQU@Zk+)I}U4B!G?Mti);d;V>cL0Gws5VFgzFpc`Nx zQ+Sa*Z-uxhA2*#Y5Q`wWi@Xbjy0PBH!RTJ+zcXLl_ELB7QmC1r+ubtotm*Y!4XJi6`*xOV&pXkRUj%4l@Hm?tQk0 zc67=Fprn-1sH0$Bk^F$odj$&w&V>O}g*kbXjCci6*u2|>x?4+g$}c%!j`(SM6;4Pw zLZr-!1&K=Q!eQAfP#jGa$bh8e1s?@qo=Wqp1*2du2om2f1S!nI%4-##VgdIoEdfRn zE%^ASd@>`l@;b2cDIS_U#D+-ppQRugQh#9n-E;rAD14-_@(12}@=?U{&=TqLmrYZE-^qJCbk@rAi6C*034jSb35{8g>h$Q!$uC z>h+Vj?LtM*p0w&^AMS~CL9AkdSyM2I0T~-3p{{0aurHl50)M)=b~zV>dJp6z0?j=B zn9CpXlgppF;vMaC_a`i+qNyX0)q|D$96!kUoDx|_aX3yCJkpN*3py>30k%69dtf!! 
z&o@*qW;7GwYGz(lHqL`0&;xP{OUEz(ZN!l8+k>&tE>wK+7VOESN#iFd5ktQ1_Ou>b zUl)3n&WZ`brGUX*ZbI5_^=859-=hsXO zsln}s(H&Fkq$Mv!qQcAH+s8LF9ZQ!L;5-wIVi= z@jelDB@F$Wa}1HgO|y!(>hE}nwcsv+(WP@2oMglPl;`{a8zfy{N{j}w>tHo_LWp{E z*cGejtYl&BWBJ{4w|qnBPj(6*(=?dZ&GkAyc^Tp)-~b%dQVAgQ;*z|MF$rcvKw$H_ z2IBqsJ?@$;q=5(Dp`X{q;LH$T_c`pB`^kVQ22Uwa!1gGafJQBg!3qrL!APq&hf*D~ zz&9&aaJdoWgbPTOMeW;c8p3Q&#F!C*m338OAqcz*d&K-&fBw~x2iNETmJB~g1Bu&% zzg9eu(m8V5_H+(p%(wy;g}4SYBGFG2SPCvbRy=qFZF@M~gicuwB_e$($?N=!X`qWI5p|=3WeN_${C*`hX!rg7 zb&Zx8+y|x#KUP!aPJ!FA9~`iZ4@X+{o71Zh6F{iTuSKdBh^j%>MP(dSmfNg(n3x5y zwgRF}4p=Fl)tFSfcL_`^OZvd^t$S6&escma z9n^aAJIcKBX!q*-11aLw*KJJQaUPC-m18H~x8F*}aWnC8_l!M_39C4Nj(tx8{ z7fg)}_lsZn?hLgOgg^V6|GqCPFo4-t&t#MXY|qkIAkI_}G4JQ$KeE1{>@}vVd&to$ zpxq1tqAQwxVtUf84}uyPo`)kjWm51+VF)r;MEnT3DH)sV z^`c#fY7QXuy-@XAj^Hifw-LD8=gLwsISNER+=zTg{Z#zKCuXdX;gf>Nq3zFqwWpxv zj|4?TxIr_CY#;J$^*$#fOoUaR{c<{H^uJ0nMe2Yl{eo3zve_ua;%-AM-7It5r-c_e zuWmTO(n6KRZ;w|J*#`pCxczUlfhD3=n>&kjPmhI7B4(ROVIL1rBVbyU;G2QQNM(?8 zEYIly=;R=~Ki-K|+$LJ>)upOgNofSqk&~zWXBJyKp;MZt5OFQx`h}86hpdrov@NNB z?w3IEE8W|w=_b}leO+Y4KTEFxb9Sr7l%a^rgvfE5LA2I%IV~$uU zxNrRX&=EGKArMJWWa*7wq&gZi3us1jVCt|c@?RXTD*qz#S!eR5QXOI+)}9bye!_jY zaMa^Q`a3X@_`1-Mf7RC36GcYWyXjp8VqoyG77Jpp?SbW>2I{$Q$Rpg_Z;To^Jt zbO?0_p+f*)jp69a3-*SyhDr?rZUWX0!8h)GCtVx-hbtIvV{=>-oq4!u}SwK4cw4~ zw>=;}FwF!1QziK?gBw%{VAfqo8zFKNGOjFI>8l4Gwy$FG0tpXk*=!JMX@rdkItS3n ze)0IPpNDBexC%VWr!|%WNN2No=nqgHNJKXHr=Bod-0RxiD76a|EbahT8G?GKQPI#e z-M@R!vMIcWvXc>Vii}7RKB3Cmg$9bFJo-tpzIMP7`tEcPkN{BrPkMobGwAWZ$>NX9 z)Bf6zPwZVv@{A)`?MovN6ozjZQcU$Hhwwk;NB^=;)BN!JUQc}v_jn@}@<=KT$+q{% z-uXnw3LV84iW+ILzIOk)0LYI9zTyIRh<9cKH3FHobwo*E2IfdkaTy1sJ|;f`GdX*5SqGq6fAgz!24i7WSn0ig|+Kj9sYGW(@4INM?K;Ayxhd|D@ge##04Js1G6bl!acU%SG!pU+5Nqx z{R6ItSyM>N-s(_sHiOQn@^bqlOMaGx^?TI_0T`SEoI}7CZWl~*p$IayJxH-?P6|?) 
z1~v2wMgEzI?v#EU+be)HA4I?zIrzahU<8JsKGX5aL+UqhRw@150$3y6>H`E3qRJk@ zTT;S9xYl0W9kj^N+3c zv%dUA3Q)&>XPAFvaUX$vBOJa+uS#TW{QiQoE{tE23!!Xi#xJlW!{TbyhRGt89)qD9B9IE!S1C!v>7SyLJKOqScme4KBR7+ z%+rlV(+j~!1K{!Tm$UIa(rI!gaw%PisYdc>2UXDps+yiF)l0}(vF)n0wLaouY6Kk> z<^eTd3QZ*iCDnYhB1YxAs@J4_0mvRSHx5-+R`9#3W0~AC_1LZ1s=i#AwQYsEGb^T( zxakdev7e|ULpsMmG}MBNND8}G?`3j}d(~D!I}J5|gzN}hjgz{0Y3s7(0bK-JS65~b z!hkWMUn^fPjQac@Q~PC-IK|=p>jP78U7p03C8&hkdmUFkw>Fou{Yc z9DB=;*wWvbuBoKUq`xz6CLlXO=lSi1_d>5G<uQgQ4Oy7=>R`L*?yMh(rJ6_=a^=V0n=vQmdpZ8bah=7et)xB&M@pJoW6xU zQ?oMAfYRuOV2T_PpGiT}>=R*K;XXXgKoz{0JE7L7cp4YqMN^3? zfJ4L`52^DoM&9h&aG_*ZlzX9TNy_`Lzj~%R{MJjtv<_wGb_d_KNwo`YAMo2*sA=Pf zd3~f_ILAMW9bYXuxHagwMNekU`P`Hc_O4+jRhaOqb3j?WE7_aw^CW8cWQf2`?52q2 zW?cy;_R!QTQceK0%0_El5)esXNM33BP0*`|K@OM(d!Hs-JU%?2MF(yAC8X;Lj zXF6qzHS+Tfl~uh8z$SQk)XDbN_uTLwMLfQ2io&-m=*)3I)8U9>eW$X-Sc|%)ROSkG;fN9M^@?@?0k<(-CDrIqe!;;PVO1;(dv4mfJher(Lf!ARS|fNfPu2=5$7b zYD)br2`a9Ac`~-1Co%22V7H9lJds@j_{Ya-IJ9z;VtMuOzpviK$v- zp<^oQQsWyv_P`OH-QIO=Gzb12CUa0LZGtTCL`8vYuN?LV>D&BUQ>2?+rq&kTbSL=Q zt~y}co_x-qGBhnTWx!>nkmVs@zbt zkH-M2%%=M$c#SlVWo_<enBQGDLKGK>o7hA zhJoCu-^N~dk|QDfN1{-{3MKvO56Vah)3bZ9F6i)JoiJ<8@l3=(V{Zey&CSW@i#OPu znvP&^o3usdB&?3oNS$%!Tg@E1uA}AYV1fN?q|wltr1^8>Y}q?(Vcirj&9HIe-ZdT) zcXW6TakHzs@z8GZaCwkM!L>xzpYtmt&{7ubLwR9K)TRj!C?)Mjr2Tu(E>HFI`RjDC z-68pCybHZtj4&5Sa$JprTbHkW%=t2(^#qmbF5E$oizq}QDf?_NEaHAe-e{m|dULa2 zz4_XxVH8zgYu{EjW(~`pI=AW_{RLTP*Wo4I1*dgXxiapu$CrBHZIP??kBC+*6Py$$ z9B*9c=Qvc&EK~5M>-MKWp%u$}TcUaB@N?C9SPc?#og~|epxfUkOgW6N^Ca+wkD4@~ zzu2nG?pO0nchnc)v=qDE3XIMyHXYTt=4EKKvQ>{u!)<%*rbq??-dl6KWt>`YN-1am z;}%_6|BmFx!6l1~oU_|F!nT*r@7%WxoEEM#SGQNz<#l)w>b5N-mUi%wDKF)g21LOD z@b^WMVB|<2$p@2jt}!Qt+hRpCBlVcBOhp5E!D-#vQ5nV5cd_GBQ_dc7(H-10Plr9> z?6B=UBS3OAaZk*M)4I#mJ~30IQAIu$+FQTYDiAbHS|iRIran#b`n^gZtDc{=$;+`K zIsLBNm%nq7eLAn%AgrRI`Tt+^l-KV48HZ;+9yaTd`uB%~1~t zqEiCNle0fcE)cdAxp)Fn;&;;iyjV^5nwqa7N#3ee(PwQ#)(kWSbp249yl1VJku&Zj z4L2_Ogv`-&xtbiznsvaq=|;jeWDy_A_oxwAhKO}@TX!{l=@m5U#;h#dj2@Ma$A+B56+zCiZS`Tb829q#!rVUF+IF}4Bt!hE 
z`u1gaod$sfrP1HVi@N6B)@b%WW<=HMdM~wyY`ccDkMOWvdtUf+F!6qf_CH6U~DrIHuQv&NJ+v~5Tr8Pam#EY)Srausq zcGc4l3vqd<^AXUI9Hj>DQo2snyc;=TqJt?XWktAp?q0%cyrbDaLPIj|%KIkwkZC0~ zP17(59>LOc(AMpE#G^w{q&sM;$GYtyHQo7c%1z%m<`s08M-#pB+N3DIGt;!DG#@fC zN{{J!pMaINfJJXoowUyJl0;C=*yFL1NJXImo6=+;jYAb(=j`PgC==LbG=LE9$CM0=xGywFr zV)iktyU^|_A)(j&1$>RG`>g-|hQIWxG%odq!@~$!b5>v0X>C5CO}0xxQ=|{QxaPrE zTe=&3WKGo``Dt^(dJ61#2iH?deg5Ulv8NVSuMVsNC32e94WV$}Dkc3pxZR$dZGbbz zqovP+CxpH$D{MAN766>AKPAq0=9i>@U-xL8_Km1@U&jzca;f-Uo$^_*HJuZ?EswLG z8LqJQ^G+>@Dv*^Z(%W&n&6P0xOrv)w%q{l)+D6lc7m@C^jPl`a?PQD!&cGx8dHYCe z38lb&4s6$&is$ZVnT3Y+&qqlNMXQz^pKoNTj|4cYNVwh!@6j_J)k&NcWEtC0kx(?Q z>2-XsvKJ%G?srKxL2uNy_TZ@QU0AR36N)u?+7-@G6j>W(`jh=lZ<#61btpH zY|O3W60GXgUB9%|r9NPe<{3=c-H-ojAa%S1Fwz=2^XSTvi|M-F9NKC{Lv5Wp{-MQe zv%$qO#jKhVmXoGZHHCX;_KjBhSc+_3m@@Usaq;gprb6WOoZYA=6s|NNJ34L&2GSZ6aB@mXn=MH0z0A zU2x^3A%?kpZ%s!h)07qEW}L*PRuvLcnNc@+_O=|1h_pTY62_;b8RFcM)7ED%(F9<1 zVongqlxK)1MAE7UNA2%)Bwc0`Kd0M(7O^c_=reTIemh5@Z(S?ITf^bn9ndplI&aby z?G`giqBwGXBfG55_!Spz;?hJ7!2?0TMQws>Z0vmsN zqXK7hN7q;LfuxLk&oZ(gx%*0{g@oxxkj37Y zt0e1BpU=|hE93{J+UCwl>@8{U3^y3dA0^2#u3OG@P2((6b@B@h8b=p1f3L=ixYaq( zSEp_JvQEeu<}_O*E~UC(U6a!b+sRuqT-x8W{S{*DtC^%pkjuBbNg&9Z*?}G!H1%@1 zlOE@FPoXSR%gnm5El)4tYkK8es5JYUClnx(?eZ;J=!q`W&L?kHa@OLx+R`FrbjGf& zr{{8wj{L%gPZ3^O^!VCawfDwi%T06l%37uScidLb?+UdBq+Ti|;mFFy+eWT1IyJ(9@SNb6fJQ+DAvK^lpE0U z0!%L>&)h0h+VbK3+UBP%jZY3;GTyOwrHyJ0iD#ag@^H&$3r$-j)Xh;^@bI6#?R<|B z?_z&_G@MGz=G^?sgQfQpU5lrZhNz{PBm$PnthV#fOL1zina}#w?p@x#DV;7o7Z;Hh z`)u1@Qu=l5HDT-JIYR1Ug5o!nFJ=;!4~bhhpncm8*YzANij?WgL+x+mn|XCjbdXYZ zTn!OO{kpAV(XV^aw}~XrHO%|IVRZyMS4p9%Nouv%>6q5LE%olok3zX{4}r>TbGuwQ zSa-k>7jwrJZ);U4R3N7)L&(w||7Zy`eDtO3(G4bA9wnN4y8+laiDIU>?^8jdUGY1$ zwnjMgmej??0-|mEDpp=f)NI_3_6Wz+*a*tH*@}6K{OQW=EcP}G-7Lfp3KC8a42SZ~ zT&s%{)Y#RpOKx2WCNNn$V!Bjz0|=8t@E040Gzn4;G_B=qd)!FAQtzE5HveBMMCZy@ zp4xw0a)P+-{l{9N6Rzu`6rK& zide zxjpSO)!Fj;>&WzCr1!kiR-=cYc;h=ZJ!wGH4kk;s5qEm<7M7<=Frl19X5Pn%N!8AY zFQ`Q27QSNE&AAemXKTCJ&GALBaA{ib<#F{r69a5|aSggFG*`Q1JtcBG!@`EYa3@~E 
z@y4S)JhQQ%=`W^WkFd~O-bL;C-FKIDj84p@P}8R#7jsFPTMF&30t>@2CRMgt=_(f0>VO3mVTQ~>6}nqn^1)dmy0iMzOX zAKsRv#EI4#;P4h?((>1RLZv@L$fO=8J|;}CZcF;M$Sv6jc|`-yKR+y)9;!H;sYGBmv?N4UM3)noj>@W?jxbWMPWDgP! zkB3!hY}9l)en|(bWcSRKJN~G-o@?E2QCFFbC=TVj+kKhyVg4>1a=bP*uWW1m)wSyR zavUno3ZNJmpDz>XY7)p|r{<$1H?t2k;j@imL+eItSt6G>ebLI!e2zxn+9EDjW4DdtnytlSIn=VHVM_}IS%$IHb~Mpa-J{^Bo6KUO4U?hq z=G|~jY){nkAQ0SrF+pA<*Ydh7x0EM#3IFM)i+s^t?~$ODP`q%X`LQi;QA7z^Ed_>n zjP60`jf{X*BIM(m^hkt9%K-mcU{>0P2)9=pC{5E&S(1#DNICEK`=JUT7u=^qiuLFB zCDM{HkrRI7PJxw`BD48tb1r2bPO3=i@V8SXIN^V&a4x<()Gp2c%h|?J_wa7zB+U|` z8Y=zlq^gMmBR^DAeMAbYZ2Pep?eMUfLFQLNKMP(L7;j{o<;^uBgv3LMLmD13U5?uC zrYq~02hB8zZVdP}3)|3~^I!MuuedvEGP>e;1CF%>xgQ;0s0Go1%(#rZrfVF%TI}bi z7YAo!kQ2pxvo72=X=gQDs{T`<#9KB-PVj^M4j0h+CkoWR9S@Th{*;=0O_F<(v6sC* z&w@A)Kc{_v7YM~(^ zCzTSLgp%*zm$Y@_x^liKeZQctQ&W*1S}b~MqY zive=^LqgSyqXW3qyP7Q|j`)fnS0}q=M_X3NWSzD~_TV0|*T6T6*`PdBRrBe4%}eJm zjQ4*ho*wzt!)D-r(|tce2KQ|_v%|caxYQa(Jqoz&7(hsp7KJT)_9*l{}rxYP=xek@+D8Q)78^%B(5Sz%gTdJ=%Ojf$t> zp=G)58Z+!@R9^E+$JSTVt#a<+_`=jLG6kpbqdRj%AKJ**Kd<6OL*f_(jmC#g+Rt&* z6gY0V51CW|t>gEGr=zs{gZPoI5apD}zA)FdkNLdE-EQb^R0Uvk0#XEHQ%K)Jbjjbf z13SK9(p;IV;c>&bq17bSJp4zbg+tB7p1Qj$mafffPnM+?y-(w|4R=rJS@vYsIPC0Q zkM_d{V1}u;oW?(5?kV(Q@9XJ!74HBc#S);q#Byo~qEzKaOeuqhF$ijQqR{ z32Y?P(eP{0L=(~)(DP40HSkG$1Y~X&{Q%o6*&`}lUZkq z!5_LN$vJj-FtPS)U0RFD?V4&*caMBM6BT?)!5{-8=Zd+D?UOACr&**%mVD3*WUl{G zaIb2eukr+Tm`2Ip!nkLRXV$o)!#_`CCbcD$IL|m)xZNSkq`h45C41Rk^6g_A%;K-^ zu2gH_rgRif`z02t;^cH53D)Pz=KE~Ar92(E-kDL+0sYLe#abPsSC z!FKuN`qt-dJ&w2p#@~&{OUzcqO8;5P8HZ5Y;DonXuz?6ISvVgi)@R|p<}U}+tFuK5?<>P=ZQ_bEL=Ae5U^e@xTHe(Tyv_VP&Cj296W+1 z$m|(^O+RLGm<(4Y9NJyR{@W#6P&)mR!}E#5qzS({aHAu(?=hyiKn|SNI3lihh+Duq zBP&YEfsJkd(#G*9(xvY`Pq*K@O?&HYKTo{7(j*f$Q%`((H?3xZ>_n5OgZm>>bwNU1 z!{Lz`(M8P@BRzLlb|m!h<&)M!BoV`9eA#wjnCq9WBHER9n{`U=ljB$MEAJR%@O$GO z!_MIaISV(9CfeOfcG5iL_&=Sjzq0Vvt$BcTL6zlD_LO1ag`RPyE&IccHHS2XMDrMy zpP<{PFy}fo%9fpRE^X+i==K*9Uv$|UI`|R&Yho-`i?cJQPGy*~V#mophKL9#re6w??hez7X-6gr*-v?i!jQd5U~=CaeRCo#-^ 
z&bH^X-Y&12j|4gglSvs1XWyuhv$>ub@TyyNnEH_3iyhan>Iw1os?Rj zTV|cPFR<_ISEqcpD>k%&iVKLd=GYYVYL$0~UfpjLXV#uxFdw}FQXd`~)WJd5FB8po znWWT<5^3Z4ln~yy#9Mea-1gbASPvo-x&EJFO1{v)T&~h}%TO~(aA<#Cog*xtT`7$6 z%%%+NY+Co+Fdz$%^DI4o*&2N$QvPSG=-uAbzC3qmT$Bb!Yu6RAD%t_ed& zisB@lZauJ%UgEB;nMMZZ!n{_|_> z>0Fnzm}O5t=Dl*4avO@rm1And)PTrIFB4szlB-KbB~U%j4y``@)0O{~gZs-n%S4iX zN!lDPB_(3gZEJcoPowU76x$w}A57U!lc0H9_B?m6(C>8_=X+av>xS9IQ)tVJdj5OK z4N}H~h3nh?ap}oS6qORR{NbeQ>iY%0i``}C=ecUeefrdsHKmytfqc91I;w?4v`24^ zo)o9Cn!Wd3OV1tc3E6#hVDQWh! z!^p8;k!s!I8aQ$qS#ewa5~tl*YW4cP)UzWbc?Fd}o~oBCi#jvt-wkBev*@AH?r~?D z#x4k4b~YY7Qq)!U*`z+5tl~?M|5V- z*tmN1!C=Ovk#w)ZHNRcB_&F_P^O|upPYzYC8GINWDPmByoYKi0{l@hcZ~JKek;ONa zjcvBvq^eP~(*%Mh-U+6z>jRd}uV+f-R2l@@3$^7JkLZ+{zSEmZok>5^{zx-PH8Npe z#}=+j5{-p(!tiR}B=w<9Hx;q?&`Y3VuXznA0>>5$Y8om-EQc)teb4L=BQbkkaAd)$ zIjZ0$H+K3L(f+N1kT)DXnz3~y$u0@OjBXzFkXvkBwQR~#$1a>8;?>hRbZzQkR03ri z*WIa^ZmE|Jrae8bG|%y7k5(QU++U+F7}{TyHgU+)s=*ESr~3 z+th5w33jyP6VxZYCm>B6-_V>%epR7strk~AYs{J=>q}<1ph{o#RG@h@v%IER-#4~w z#6*r}YRcQ0Fz&;TjgzLl-xrpbS~y%Q6>#QD_?0w0so^@`yUn|6s@28Xkvsxs>AQZ1 zWQK3-*ANb@rEw7*AGpgKl`Co&az`|Zr|p=_E|f!T-SV1%#_;%7!5QB^ zm;6H9*_FAq3b+w%ofs~c-9_D!dE3;p8)jTp*PfU(1Od5~ns}eRqja({%kK2vqgc~S zPLF=lZC^F#s`$6xs~bnb*~`vT+^vSBZDviS#tcKfaNdBO53kuD?mc2+*NQ!8LNGUR zY3reZhnE}g=>(rEOQR2*w@Nvy1)J;dRmq3#ub=S<+ljvYGvTn_Ls`bVkK-;ceG|?q zZfr^NS{QxpzGX~zdRCK3YIX0yK$${GaJ5%-BX+^xh9{^-vc`Ee`%qY4(-T$aS#_z& zXn|d;fO+X;r>UKtT_yWcN!Q4UMPjAeUd^eH;Hc)LwV$moT#nLK^Ve5?6x!Q^5_cUM z2DdQwlh@5Mzj3{wq~7)@G8tOS-g9X>qC~8Iy^>bymvG^tK=nl5cjxB!Ni6r3^;k9^ za8TVfpYG0&=O)8(q-}ck7bKx--PKE$6V!L*6Z?)i%M*DRC(RngJ((qYWYSP4D9zon zDy_$?>3*&LG|Bzp29%pEX~zD&2sxsxV_GIO_jjF}uD?5Kdc!`q!<`Yo{CT+|@r;uw z|L4S5-1IlaRPQ>ZEO|?N2LLy*S9cHUmK;i`_?`t-;lpsx2F`v55!ewOb9)(Td6O>Yf_AzFFoa(iMi}uoaess8jzV%c0&Vnuz)Ew~JhnJ>L z1PsDCd&hTYy$lM)2cUDDsfRS0_5oISTGy-RjM>{(fm~?7F4Sl5DUy5Z$3j3E5%fMUWz zlrdUtr!SMuVD<*#lRy(yj#cynrtZ_NIE?9??-U+rT>3y#AdRlKdH9`}xCk^+K?AJ{ z?$+{yvpy%MHQM-UUWtSO<)R?TwY=4=Of0)^UM6{ab|7jdB4_L(@4l+69)FP)bSx`) 
zeZSqnEmUZv|NL4m5c`Yk=uSmO`Q{$L8lv+QMy3QvScy0^F8BcZ!Q$S17MI5={K-tw zwLpR2d(Ry=kUj+{$#ctv1-0vjf--*nYZPjia_WrYOc++hx28i^hb+XX>I;||LbOQ< za(_&-s6YwKvm@8(l!HyvyHeHEXb8=z6>Q>+yo@3dP`G?&o>)zuHPQDTa^)+tJlErX zF%h2^KhD>7pPZ1T;2cRU<6B+MN->v@Xi@j!*VlYzAKo7{G1O~DhcgTS*~arH?o(9} z>CHES58ax1^O}>ds{P{Jgn{v$V3aU9w5A+Di&o`{8GS*QG&-Nu?t9Qss9H}-q5l{4 zBHuZJ5gDAxYFDbHuGg822ClA?Ue+wN*lz{5(-r3Iftc!=QmU1L0D1JcraPlM-!_8E z`R-x7uAL#bd(qcX(iK7&YirvA0eKykUhdWpsEn0$2ac3g)g&?+PyQOsz!GJ!bh(30y!qI1( z^@96+zgcbML#niZYVzsxSzYhn>U#aYIcC>~A?KZp3+R*kN-eNE^9IE&b1JK58X%7O z*khRXQhfS@C&8#EcLapiJD=JGf$U7l*BA}|2+S0|L@W(uo7w9v}X{DTy4<05Kt)Eu3)=0f@lLxCQC%_ zsFDI;)##2=v+^3*X0y8a`<2%l#cq;Tv%pzB!>i0}cdmPiF%xnVdi{EMYy`LVI+F=? zw)pb$T=8pPLfwJRqxnG8`!5jUzdk-dDl^aJC|>_NR4Gu7{^bV!9h#LA;g7kg6Po;Y zKv+&zsB-v=And=9t_%+WfB!y2Km%8L7M#!JUX4ItSEn&Lr!3(mDe zYRdlb>HarJ_WE&{4rQthG0+J7wRq$`8a}%Z`@GR#D!`~r>?*xM+{$h3guOAM457WUM$DHQ-oAJ_pLdqZi@4~`b z{@;cDhfeMZp5lhK+=W85t6dimX#M$HSMRX9Umio+WBc#U znXW@Gvw;ghXxxMXvLq|Ot={9_2Nt56z?e*3hmd#mWojg~zOit;gplyr4~iLCr(+Mg z3pRe(|9;rN2;nJ1T8Ba5gI>xCLQ{v|i}nNzXfd=Y6a4C_@Ttdcmm!c@WG~#;2 z{{BLu=O@eu)@F?%4iXZr>pF^8IghNF9(iyiPFd$Z`H`EEYLb7Lv~)%a8(&Sg5 z3d^5o^sgH`w{U;R!Q|$wKo7bz2v4QlKAfW;o}avu<6SFTHDCN0e6h-u&PqQh1iMhR zg`j?4O1W4t%)bya>eKWFW=A82`SoD7y>Mb1J!6#+*7`l* zORh_=so{ktfZ_VkldUZwkZ!Ju!0cXAq#@*e z!lkZM!2w5(XPJou*Xa)^f_dC45o5OYfyuPVDUZ7lD#V^Y=vsRdPc8Ui4-}?}yQ&e| z7xp|WE+87Z#^^mF7tc@4{-<6-oGZW#)2QBPeqdy+*nwn5Fk~B;vV?wiByq_3!F`_9 z{I-Eqm=QnLvGy9!0HLf#VD^P}YmRCJN{~=A@XW$TS5&_h@5Hw_r&8aQo&}8OWokaQ^E+P zfXUdKjwvE%T+Vb{oXc~bY{g=#K+(%j$ocv77POuKi0dS@r84z{Ivb2sBJu-o+MqBj z6F(Va+6ql{n~QH{T)cTWP2{oP8}i($n=NzNkTNwNV5Nh#J#6Z?WzM|rIs4nk zsaxEMCClw-2ek4^8HL^;=Dn5(b)6^BMQ+@j^wmrNCQf1K20PZzA;3|b){0bz^i;jP zFO%nko!>&}&2~P1O#GJ`|s3gw8`z`OT4}Tre1dh;pbhw*%HZ}e(NGM1QhtF#O9{s2~4uslfT~x zENPI6un&IlhE97598K^2V|GiRpIv6F|JFd#gWC5$UcT9*7>N7Dy!|S(o~UY%!uvi( zwxv+lfBRL&1;=l1r*pTj?xs00il9F)7YCGD;m?HGn92@rcK4T!*4c3j|1m13$@$ip zfhYGsdCWwQ9)E1+23~X1lXxD-o6Op)co5NJoESeYX>ni_;L*33)$VkA>nbX#ooHN1 
z3RO;kAz_T_-nQG*^Pg`;2(+J0+KbH3eIiqP3{@>(UfAcn#T(bS=ZhOq@Krm!f8wY& zMX&ij{upG?pi6vCEkZy_uifRoY7k|8wgAH@${Wwpk)@exxD7=sXB}6sKGBv+@*Q-Q8?g|#te7pM%E7gwd$nf|!>Vvd92eO2{LS&>t{xOQCDix+P?=#65|n%=5M5|R_^J9pJ!+dp^)VtP2K~_~l ztdEvOh)DAMt6~QrlCBb&|A2w^Sg9DE&_&gI=YA7DUCdVN^#=Y(^ijqn6*Cg;M%wK-%Jv2KDu(YqXqF3tM)0yIPBq!BSusc0GMvcx zJt+P|8mM$ZQ4$MTJAxIMe>skwzt*|sKX0G!zG^)8huoyrTJL)wF$C|68B49G#Iu~` zUzaQOfJ?KEe$3*zna#y#YulgqhKMy#hx9IWqahaJ^!ixAU7!w#-fGdV$+sy`w@fqUCt~nS~d5^J971s5?$G zpPy`8M?6s8H7@t{X@CM9HL!N}-o2s3s!Fj`K#P(O<+2nJ5tZJDGm4gZ(nx2GjS~Q< zv!;!*N1myhx}%nSbu}KFOv3NBS6I$~cCVu8Dkv+My$g}swn=-mw8Gwho;h*NhiGgY zNkb*y@=$Q^vqItgujO@F>!<}QP}pusQ4i9p`vMH1H|1wpoeqC zU<>F=y+{tBL}&yd()k4w5%?~l$tt1sL5%i`z{RQ*ju0>XfMC6SPTIQkOiAGEr_EZ3 zLOx8#dfd3a_#AuxyeB%BZn)2fYz4PV;}Cuu8u8sCEen6j(!PB&umRv90Ed+ zWB236aW8Y8UeHdT5Wb$x#~{NkhEd6l-d(h(85x^Syg=IG3r00w(0%4DYJ1S)UyXo1 zJoz#Os7^_eNw5q1p2k4GcPjSY2NCp7>YqIzKM8urOQ`}XE&HPD23PY`1&o8aLa2g* ze+82#%XbI%=KT&KDP8zNnj7slhRNco#$9Zmw6GFwAde!@?@51$Z8)W3J7Q=6K{E*E z|JJ2+q@cYYTS$Eaf}K3PqDv1tfq6eY*(x5RZO8E%T@e9?*zo`>?u^UvXr~P=Zc7mw z&S=mz#fX@oZkA<|$k*~aMDKme)D;dN4(te~-cq`l5|7%BYO?J5Ix6oso}Wr&Ql6++ z14MH92$KG4q`upc@rTJ_Rx+HCvm-cvIuaHr6n%a+a~XEYb}$;oWt`R2opvdWx`Ni5 z@$(+<;@cd{W(@k4Y4Wk7QG?*~iPeRp29Q6lHI*vPFIbA9lNt$fffX)_<!mJ*3sNXbKpQA(pl2x7isgYBlB!)@{StFV}$I2=p-S1;X+6-l|~69 zK^0Pd2?3S@t^PW3WSG=}P$V?0$&x*3SJ_c{mZJYBh&sLKTK)0&JtlolqK)#bj?}>M zWx&5m;--t#<0rErCfpTnX*Cq)-@qVl#dC8EszD3N!}%6^kp`z_g_DGc?_20vu|GEC zJSr^gOEPYF7&VQVt%Tr-Ul%57k9vv7i*RR+FFK@}-Hz-X_Pg4`z8Y2EcB)rB@2&mj zu(j2vIh9lfabXX!YS%KAq+wNyNcLD0J;J`8M@wl&=2(eow5OXR;J7b?#>ndyq02<_rQJ1N-k+jrc|0ROR_02kN zgszj5;+|&^Je`p|@}9)U8nJ~6tYB)&^x%-bSRB}D3ZHAoh~A#PzC005={R2D989?S z*k)3#PVODILKxb+OC{0=0Ed#yWw%dTHu6UHKz?%$7!=>Ux=#4YY1_Y29S>rQPHZEV&nH{;aW-*babS^)WEc3`F?h=9u$%ZY zM|Jj%HV8hWcCsVpP<4q#rS9_uZSNf#uCHS%@l+aYi4n5TnISi`SnWSpVaI0TDCKDr zu7UFpj3N$=y@BAbr{@R{lWGl&#SFOQyGIuhHvPgPyLIHnSGI^Rke)FL^3I#@f zrlUW98Mj8UT)dHkX>(LZFl#tr0UE`0zC}?wI6u9szQYsIABw3{x5XZrDDx((C^d|_ 
zmPWD81qg#)rqPrI^&NHhbrTmEX{cFKH*yD-yUABmaC+>?}-3cvWw#)4-TDX_uvjpJCygd4(WII z<{gF04C-nj!>(iL-qkTlT33pupr*Fd>8WGtSlgfQ~E3SgS zp5P)cj5SSeX9Y)?ibpG>)@WaIIy5iUbfjhe*2B2L4`Q!nrMs!i!$|B@WZ6;QZ2k1s zhmRK{8V8+u4&~FKhF4U}Ml;W1odSjncRijO>(_%+g&TYVs_PBi*BxDaidd(|b2@MY zu!Nk8{PcTis0wklNnA_{5cB#m(q7a(Yfb_TVH1V ze&{+pigcozcCu<^lG)UE_0IYmGEpS!{R{}>9H%0)m(qHs5d7QO{gcDbE^}bmnJKR& zagqu0qd(Ja0_9&jI(b(hg;E2FMo?!#bUU;9yMYWsjsANTmRAheX!Re zn7j(!By5~cyO9TEr7qWAR>5Dd{q{sEBNwWvz0abnD)ch-bfI)Y^-Lnn`LhZ%%#J_0 zSpgHS?i^z9>+yJ}NH5Wi!J(gakTh!xbcRR)UxUN}zy{v8n{_vOoZ};98Ubo-o08j60#z*p_ zWya81!r)eYf=za*+o?QEz&AC7&5 zKsxFtdwW)M$u2tA`!Bn zMq;c&l{WPMj$5OfEdJ#~aj}0ABNtX!v&|+ur1VYh5$}EEM2+*vt48iGW0)=mO_sk` zJ^6Q#=}X$uD_JBs%SoIOfH*y{-mKN2-9dTWu|vf3WK!9g+swf%LOPQxX5mY>4HjKj zBXnTn!6D?&J^7JBlaNKc@!CqX41O9pQ|PrV1b4!3csb4Pq}f|JvqlDWd8%0!KN+h$ zsNAg=R|92q0Q@lO0$X1hk1ak*^RN~47wZxIfw zY^ChWC82ioWx7pG|-uB>GAp1ug3v zBFcEm_j{LF`Bi&nP+kN@JW&CINNz{Gu#NO~=*U7FY*o+DX0BMkA1MAVH^lb=)VMDf zcc?rc-4`Eay&wP0n=l!>l4SEpoX*P-bSEw6<7x;?Cpueh#>vm4swIi7Uu5c9q}q@2ySnBCB1f`D z&XIWUxZU+uo`CgFwbmt4uGP(1FNs_K3@_-rhuo+Wf2yxfnG<$eY(vkWEKH&jYo)x# zO+Am3YTk%JPk-?c;D6Drevk4!Dz$6cqkpmnugNkJol)_N#PoQlkk`KXTNS^bJrLcA zQ@P`EFoW*2yPexxA2!m96s{IT7710qeCaRYw;Tf+YZBroKVUu=qZaw!8Y0mpw~X6j zH1H)Zj@wjp^9DehdgZ;?xQp^;EDulBJtvfM?=f?Twl~Z!-xo79w+R z?m3y{g}q)^cAVU<1!^RCP#ZcpbNgp5dRboZR@1}>bE$E|zsz7C40#O+a2Z0&bYJnE zL+Obt604xV;e-$qeXP8$yN9c)LfxVG#NQ4y8;yW`>^7);!3`3^=UAZuMdwX-Cal5( zq%K3aq)j%Q0gk+LN3ceo-!~_JusLvO5()jb(<@Uwe&z`)kH>SU~rqQ!T(c0+>nxd*^o%(%^^@FR%O1DXy7n`_Bk0-=Xtwag>nff7wl|*yY>FBN|G*O zdBIc7m#fCV!@5B^kV9A9a{XjZ|EB)ZbFMM@;isJUI>o3b&KVr@?Je>E(h}7PvQt0G z_cMT(R_DG#ukR^08u8X2fghSCP_}qVSKRM~Al5z|0C!_t>3;&IUrye%9~b{C`nj(e z+qina4VJ9)eVDO5pelORq&SJM4uHwKebQ_?LJQ>ij@pEZJnqzm=3DYDz(S7*s&GF& zW@0kQTck^5E|gW2H$`-?dafe;o{!_DQlOwId57vrsV1iUhtu2FUnmv5_D3TC z$=GBYu(|4t;u~iHg6ObmK`fC+XLe<-lWd@sjRT|J)M5R z?&go@e*{ A=Kufz diff --git a/docs/design.md b/docs/design.md index 62ea14e6..6bcf7004 100644 --- a/docs/design.md +++ b/docs/design.md @@ -1,3 +1,8 @@ +[This 
document was written in 2020, and the contents are outdated. +Specifically, we no longer believe that object preloading is a good +idea. That being said, most of the points in this document still hold +even today. Therefore, I'll keep this document as-is.] + ## Design and implementation of mold For the rest of this documentation, I'll explain the design and the diff --git a/docs/mold.1 b/docs/mold.1 index d378f32a..f0d9d1f7 100644 --- a/docs/mold.1 +++ b/docs/mold.1 @@ -1,6 +1,6 @@ .\" generated with Ronn-NG/v0.9.1 .\" http://github.com/apjanke/ronn-ng/tree/0.9.1 -.TH "MOLD" "1" "July 2023" "" +.TH "MOLD" "1" "August 2024" "" .SH "NAME" \fBmold\fR \- a modern linker .SH "SYNOPSIS" @@ -49,6 +49,10 @@ Whether good or bad, you should keep these semantics in mind to understand Unix \fBmold\fR's output is deterministic\. That is, if you pass the same object files and the same command\-line options to the same version of \fBmold\fR, it is guaranteed that \fBmold\fR produces the bit\-by\-bit identical output\. The linker's internal randomness, such as the timing of thread scheduling or iteration orders of hash tables, doesn't affect the output\. .P \fBmold\fR does not have any host\-specific default settings\. This is contrary to the GNU linkers, for which some configurable values, such as system\-dependent library search paths, are hard\-coded\. \fBmold\fR depends only on its command\-line arguments\. +.SH "OPTION NOTATIONS" +Multi\-letter long options may precede either a single dash or double dashes, except for those starting with the letter "o"\. For historical reasons, long options beginning with "o" must precede double dashes\. +.P +For example, you can spell \fB\-\-as\-needed\fR as \fB\-as\-needed\fR, but \fB\-\-omagic\fR must not be spelled as \fB\-omagic\fR\. \fB\-omagic\fR will be interpreted not as \fB\-\-omagic\fR but as \fB\-o magic\fR\. 
.SH "MOLD\-SPECIFIC OPTIONS" .TP \fB\-\-chroot\fR=\fIdir\fR @@ -63,6 +67,9 @@ Synonym for \fB\-\-color\-diagnostics=auto\fR\. \fB\-\-no\-color\-diagnostics\fR Synonym for \fB\-\-color\-diagnostics=never\fR\. .TP +\fB\-\-detach\fR, `\-\-no\-detach +Permit or do not permit mold to create a debug info file in the background\. +.TP \fB\-\-fork\fR, \fB\-\-no\-fork\fR Spawn a child process and let it do the actual linking\. When linking a large program, the OS kernel can take a few hundred milliseconds to terminate a \fBmold\fR process\. \fB\-\-fork\fR hides that latency\. By default, it does fork\. .TP @@ -74,6 +81,11 @@ Print out dependency information for input files\. .IP Each line of the output for this option shows which file depends on which file to use a specific symbol\. This option is useful for debugging why some object file in a static archive got linked or why some shared library is kept in an output file's dependency list even with \fB\-\-as\-needed\fR\. .TP +\fB\-\-relocatable\-merge\-sections\fR +By default, \fBmold\fR doesn't merge input sections by name when merging input object files into a single output object file for \fB\-r\fR\. For example, \fB\.text\.foo\fR and \fB\.text\.bar\fR aren't merged for \fB\-r\fR even though they are merged into \fB\.text\fR based on the default section merging rules\. +.IP +This option changes the behavior so that \fBmold\fR merges input sections by name by the default section merging rules\. +.TP \fB\-\-repro\fR Archive input files, as well as a text file containing command line options, in a tar file so that you can run \fBmold\fR with the exact same inputs again\. This is useful for reporting a bug with a reproducer\. The output filename is \fBpath/to/output\.tar\fR, where \fBpath/to/output\fR is an output filename specified by \fB\-o\fR\. 
.TP @@ -85,7 +97,16 @@ This option is useful for finding bugs that depend on the initialization order o By reversing the order of input sections using \fB\-\-reverse\-sections\fR, you can easily test that your program works in the reversed initialization order\. .TP \fB\-\-run\fR \fIcommand\fR \fIarg\fR\|\.\|\.\|\. -Run \fIcommand\fR with \fBmold\fR \fB/usr/bin/ld\fR\. Specifically, \fBmold\fR runs a given command with the \fBLD_PRELOAD\fR environment set to intercept exec(3) family functions and replaces \fBargv[0]\fR with itself if it is \fBld\fR, \fBld\.gold\fR, or \fBld\.lld\fR\. +Run \fIcommand\fR with \fBmold\fR as \fB/usr/bin/ld\fR\. Specifically, \fBmold\fR runs a given command with the \fBLD_PRELOAD\fR environment set to intercept exec(3) family functions and replaces \fBargv[0]\fR with itself if it is \fBld\fR, \fBld\.gold\fR, or \fBld\.lld\fR\. +.TP +\fB\-\-separate\-debug\-file\fR, \fB\-\-separate\-debug\-file\fR=\fIfile\fR +Bundle debug info sections into a separate file instead of embedding them in an output executable or a shared library\. mold creates a debug info file in the background by default, so that you can start running your executable as soon as possible\. +.IP +By default, the debug info file is created in the same directory as is the output file, with the \fB\.dbg\fR file extension\. That filename is embedded into the output file so that \fBgdb\fR can automatically find the debug info file for the output file\. For more info about gdb features related to separate debug files, see \fIhttps://sourceware\.org/gdb/current/onlinedocs/gdb\.html/Separate\-Debug\-Files\.html\fR\. +.IP +mold holds a file lock with flock(2) while creating a debug info file in the background\. +.IP +If you don't want to create a debug info file in the background, pass the \fB\-\-no\-detach\fR option\. 
.TP \fB\-\-shuffle\-sections\fR, \fB\-\-shuffle\-sections\fR=\fInumber\fR Randomize the output by shuffling the order of input sections before assigning them the offsets in the output file\. If a \fInumber\fR is given, it's used as a seed for the random number generator, so that the linker produces the same output for the same seed\. If no seed is given, a random number is used as a seed\. @@ -94,6 +115,11 @@ This option is useful for benchmarking\. Modern CPUs are sensitive to a program' .IP By running a benchmark multiple times with randomized memory layouts using \fB\-\-shuffle\-sections\fR, you can isolate your program's real performance number from the randomness caused by memory layout changes\. .TP +\fB\-\-spare\-program\-headers\fR=\fInumber\fR +Append the given number of \fBPT_NULL\fR entries to the end of the program header, so that post\-link processing tools can easily add new segments by overwriting the null entries\. +.IP +Note that ELF requires all \fBPT_LOAD\fR segments to be sorted by \fBp_vaddr\fR\. Therefore, if you add a new LOAD segment, you may need to sort the entire program header\. +.TP \fB\-\-stats\fR Print input statistics\. .TP @@ -105,6 +131,17 @@ Use multiple threads\. By default, \fBmold\fR uses as many threads as the number .TP \fB\-\-quick\-exit\fR, \fB\-\-no\-quick\-exit\fR Use or do not use \fBquick_exit\fR to exit\. +.TP +\fB\-z rewrite\-endbr\fR, \fB\-z norewrite\-endbr\fR +As a security measure, some CPU instruction sets have recently gained a feature to protect control flow integrity by disallowing indirect branches by default\. If the feature is enabled, the instruction that is executed immediately after an indirect branch must be an branch target marker instruction, or a CPU\-level fault will raise\. The marker instruction is also known as "landing pad" instruction, to which indirect branches can land\. This feature makes ROP attacks harder to conduct\. 
+.IP +To use the feature, a function whose pointer is taken needs to begin with a landing pad because a function call via a function pointer is compiled to an indirect branch\. On the other hand, if a function is called only directly (i\.e\. referred to only by \fIdirect\fR branch instructions), it doesn't have to begin with it\. +.IP +By default, the compiler always emits a landing pad at the beginning of each global function because it doesn't know whether or not the function's pointer is taken in another translation unit\. As a result, the resulting binary has more attack surface than necessary\. +.IP +If \fB\-\-rewrite\-endbr\fR is given, mold conducts a whole program analysis to identify functions whose addresses are actually taken and rewrites landing pads with no\-ops for non\-address\-taken functions, reducing the attack surface\. +.IP +This feature is currently available only on x86\-64\. .SH "GNU\-COMPATIBLE OPTIONS" .TP \fB\-\-help\fR @@ -168,17 +205,15 @@ Use \fIfile\fR as the output file name instead of the default name \fBa\.out\fR\ \fB\-r\fR, \fB\-\-relocatable\fR Instead of generating an executable or a shared object file, combine input object files to generate another object file that can be used as an input to a linker\. .TP -\fB\-\-relocatable\-merge\-sections\fR -By default, \fBmold\fR doesn't merge input sections by name when merging input object files into a single output object file for \fB\-r\fR\. For example, \fB\.text\.foo\fR and \fB\.text\.bar\fR aren't merged for \fB\-r\fR even though they are merged into \fB\.text\fR according to the default section merging rules\. -.IP -This option changes the behavior so that \fBmold\fR merges input sections by name by the default section merging rules\. -.TP \fB\-s\fR, \fB\-\-strip\-all\fR Omit \fB\.symtab\fR section from the output file\. 
.TP \fB\-u\fR \fIsymbol\fR, \fB\-\-undefined\fR=\fIsymbol\fR If \fIsymbol\fR remains as an undefined symbol after reading all object files, and if there is a static archive that contains an object file defining \fIsymbol\fR, pull out the object file and link it so that the output file contains a definition of \fIsymbol\fR\. .TP +\fB\-y\fR \fIsymbol\fR, \fB\-\-trace\-symbol\fR=\fIsymbol\fR +Trace references to \fIsymbol\fR\. +.TP \fB\-\-Bdynamic\fR Link against shared libraries\. .TP @@ -191,8 +226,14 @@ When creating a shared library, make global symbols export\-only (i\.e\. do not \fB\-\-Bsymbolic\-functions\fR This option has the same effect as \fB\-\-Bsymbolic\fR but works only for function symbols\. Data symbols remain being both imported and exported\. .TP +\fB\-\-Bsymbolic\-non\-weak\fR +This option has the same effect as \fB\-\-Bsymbolic\fR but works only for non\-weak symbols\. Weak symbols remain being both imported and exported\. +.TP +\fB\-\-Bsymbolic\-non\-weak\-functions\fR +This option has the same effect as \fB\-\-Bsymbolic\fR but works only for non\-weak function symbols\. Data symbols and weak function symbols remain being both imported and exported\. +.TP \fB\-\-Bno\-symbolic\fR -Cancel \fB\-\-Bsymbolic\fR and \fB\-\-Bsymbolic\-functions\fR\. +Cancel \fB\-\-Bsymbolic\fR, \fB\-\-Bsymbolic\-functions\fR, \fB\-\-Bsymbolic\-non\-weak\fR and \fB\-\-Bsymbolic\-non\-weak\-functions\fR\. .TP \fB\-\-Map\fR=\fIfile\fR Write map file to \fIfile\fR\. @@ -209,13 +250,20 @@ Alias for \fB\-\-section\-start=\.text=\fR\fIaddress\fR\. \fB\-\-allow\-multiple\-definition\fR Normally, the linker reports an error if there are more than one definition of a symbol\. This option changes the default behavior so that it doesn't report an error for duplicate definitions and instead use the first definition\. 
.TP +\fB\-\-allow\-shlib\-undefined\fR, \fB\-\-no\-allow\-shlib\-undefined\fR +Even if mold succeeds in linking a main executable without undefined symbol errors, you may still encounter symbol lookup errors at runtime because the dynamic linker cannot find some symbols in shared libraries in any ELF module\. This occurs because mold ignores undefined symbols in shared libraries by default\. +.IP +If you pass \fB\-\-no\-allow\-shlib\-undefined\fR, mold verifies that undefined symbols in shared libraries given to the linker can be resolved at link\-time\. In other words, this converts the runtime error to a link\-time error\. +.IP +Note that you need to pass all shared libraries, including indirectly dependent ones, to the linker as arguments for \fB\-l\fR\. If a shared library depends on a library that's not passed to the linker, the verification will be skipped for that file\. +.TP \fB\-\-as\-needed\fR, \fB\-\-no\-as\-needed\fR By default, shared libraries given to the linker are unconditionally added to the list of required libraries in an output file\. However, shared libraries after \fB\-\-as\-needed\fR are added to the list only when at least one symbol is actually used by the output file\. In other words, shared libraries after \fB\-\-as\-needed\fR are not added to the list of needed libraries if they are not needed by a program\. .IP The \fB\-\-no\-as\-needed\fR option restores the default behavior for subsequent files\. .TP -\fB\-\-build\-id\fR=[ \fBmd5\fR | \fBsha1\fR | \fBsha256\fR | \fBuuid\fR | \fB0x\fR\fIhexstring\fR | \fBnone\fR ] -Create a \fB\.note\.gnu\.build\-id\fR section containing a byte string to uniquely identify an output file\. \fBsha256\fR compute a 256\-bit cryptographic hash of an output file and set it to build\-id\. \fBmd5\fR and \fBsha1\fR compute the same hash but truncate it to 128 and 160 bits, respectively, before setting it to build\-id\. \fBuuid\fR sets a random 128\-bit UUID\. \fB0x\fR\fIhexstring\fR sets \fIhexstring\fR\. 
+\fB\-\-build\-id\fR=[ \fBmd5\fR | \fBsha1\fR | \fBsha256\fR | \fBfast\fR | \fBuuid\fR | \fB0x\fR\fIhexstring\fR | \fBnone\fR ] +Create a \fB\.note\.gnu\.build\-id\fR section containing a byte string to uniquely identify an output file\. \fBsha256\fR compute a 256\-bit cryptographic hash of an output file and set it to build\-id\. \fBmd5\fR and \fBsha1\fR compute the same hash but truncate it to 128 and 160 bits, respectively, before setting it to build\-id\. \fBuuid\fR sets a random 128\-bit UUID\. \fB0x\fR\fIhexstring\fR sets \fIhexstring\fR\. \fBfast\fR is a synonym for \fBsha256\fR\. .TP \fB\-\-build\-id\fR Synonym for \fB\-\-build\-id=sha256\fR\. @@ -254,10 +302,8 @@ The \fB\-\-emit\-relocs\fR instructs the linker to leave relocation sections in \fB\-\-enable\-new\-dtags\fR, \fB\-\-disable\-new\-dtags\fR By default, \fBmold\fR emits \fBDT_RUNPATH\fR for \fB\-\-rpath\fR\. If you pass \fB\-\-disable\-new\-dtags\fR, \fBmold\fR emits \fBDT_RPATH\fR for \fB\-\-rpath\fR instead\. .TP -\fB\-\-execute\-only\fR -Traditionally, most processors require both executable and readable bits to 1 to make the page executable, which allows machine code to be read as data at runtime\. This is actually what an attacker often does after gaining a limited control of a process to find pieces of machine code they can use to gain the full control of the process\. As a mitigation, some recent processors allows "execute\-only" pages\. If a page is execute\-only, you can call a function there as long as you know its address but can't read it as data\. -.IP -This option marks text segments execute\-only\. This option currently works only on some ARM64 processors\. +\fB\-\-execute\-only\fR: + .TP \fB\-\-exclude\-libs\fR=\fIlibraries\fR \|\.\|\.\|\. Mark all symbols in the given \fIlibraries\fR hidden\. 
@@ -314,9 +360,6 @@ If \fBrelr\fR is specified, all \fBR_*_RELATIVE\fR relocations are put into \fB\ .IP Note that a runtime loader has to support \fB\.relr\.dyn\fR to run executables or shared libraries linked with \fB\-\-pack\-dyn\-relocs=relr\fR\. As of 2022, only ChromeOS, Android and Fuchsia support it\. .TP -\fB\-\-package\-metadata\fR=\fIstring\fR -Embed \fIstring\fR to a \fB\.note\.package\fR section\. This option is intended to be used by a package management command such as rpm(8) to embed metadata regarding a package to each executable file\. -.TP \fB\-\-pie\fR, \fB\-\-pic\-executable\fR, \fB\-\-no\-pie\fR, \fB\-\-no\-pic\-executable\fR Create a position\-independent executable\. .TP @@ -352,7 +395,7 @@ Set \fIaddress\fR to section\. \fIaddress\fR is a hexadecimal number that may st Create a share library\. .TP \fB\-\-spare\-dynamic\-tags\fR=\fInumber\fR -Reserve the given \fInumber\fR of tags in \fB\.dynamic\fR section\. +Append the given number of \fBDT_NULL\fR entries to the end of the \fB\.dynamic\fR section, so that post\-link processing tools can easily add new dynamic tags by overwriting the null entries\. .TP \fB\-\-start\-lib\fR, \fB\-\-end\-lib\fR Handle object files between \fB\-\-start\-lib\fR and \fB\-\-end\-lib\fR as if they were in an archive file\. That means object files between them are linked only when they are needed to resolve undefined symbols\. The options are useful if you want to link object files only when they are needed but want to avoid the overhead of running ar(3)\. @@ -366,6 +409,9 @@ Set target system root directory to \fIdir\fR\. \fB\-\-trace\fR Print name of each input file\. .TP +\fB\-\-undefined\-glob\fR=\fIpattern\fR +Synonym for \fB\-\-undefined\fR, except that \fB\-\-undefined\-glob\fR takes a glob pattern instead of just a single symbol name\. 
+.TP \fB\-\-undefined\-version\fR, \fB\-\-no\-undefined\-version\fR By default, \fBmold\fR warns on a symbol specified by a version script or by \fB\-\-export\-dynamic\-symbol\fR if it is not defined\. You can silence the warning by \fB\-\-undefined\-version\fR\. .TP @@ -418,8 +464,11 @@ By default, the pages for the stack area (i\.e\. the pages where local variables \fB\-z keep\-text\-section\-prefix\fR, \fB\-z nokeep\-text\-section\-prefix\fR Keep \fB\.text\.hot\fR, \fB\.text\.unknown\fR, \fB\.text\.unlikely\fR, \fB\.text\.startup\fR, and \fB\.text\.exit\fR as separate sections in the final binary instead of merging them as \fB\.text\fR\. .TP +\fB\-z rodynamic\fR +Make the \fB\.dynamic\fR section read\-only\. +.TP \fB\-z relro\fR, \fB\-z norelro\fR -Some sections such as \fB\.dynamic\fR have to be writable only during an executable or a shared library file is being loaded to memory\. Once the dynamic linker finishes its job, such sections won't be mutated by anyone\. As a security mitigation, it is preferred to make such segments read\-only during program execution\. +Some sections such as \fB\.dynamic\fR have to be writable only during a module is being loaded to memory\. Once the dynamic linker finishes its job, such sections won't be mutated by anyone\. As a security mitigation, it is preferred to make such segments read\-only during program execution\. .IP \fB\-z relro\fR puts such sections into a special segment called \fBrelro\fR\. The dynamic linker makes a relro segment read\-only after it finishes its job\. .IP @@ -443,11 +492,16 @@ Report undefined symbols (even with \fB\-\-shared\fR)\. \fB\-z shstk\fR Enforce shadow stack by turning \fBGNU_PROPERTY_X86_FEATURE_1_SHSTK\fR bit in \fB\.note\.gnu\.property\fR output section\. Shadow stack is part of Intel Control\-flow Enforcement Technology (CET), which is available since Tiger Lake (2020)\. 
.TP +\fB\-z start_stop_visibility\fR=[ \fBhidden\fR | \fBprotected\fR ] +If a section name is valid as a C identifier (i\.e\., it matches \fB/^[_a\-zA\-Z][_a\-zA\-Z0\-9]*$/\fR), mold creates \fB__start_SECNAME\fR and \fB__stop_SECNAME\fR symbols to mark the beginning and end of the section, where \fBSECNAME\fR is the section name\. +.IP +You can make these marker symbols visible from other ELF modules by passing \fB\-z start_stop_visibility=protected\fR\. Default is \fBhidden\fR\. +.TP \fB\-z text\fR, \fB\-z notext\fR, \fB\-z textoff\fR \fBmold\fR by default reports an error if dynamic relocations are created in read\-only sections\. If \fB\-z notext\fR or \fB\-z textoff\fR are given, \fBmold\fR creates such dynamic relocations without reporting an error\. \fB\-z text\fR restores the default behavior\. .TP \fB\-z max\-page\-size\fR=\fInumber\fR -Some CPU ISAs support multiple different memory page sizes\. This option specifies the maximum page size that an output binary can run on\. The default value is 4 KiB for i386, x86\-64, and RISC\-V, and 64 KiB for ARM64\. +Some CPU ISAs support multiple memory page sizes\. This option specifies the maximum page size that an output binary can run on\. In general, binaries built for a larger page size can run on a system with a smaller page size, but not vice versa\. The default value is 4 KiB for i386, x86\-64, and RISC\-V, and 64 KiB for ARM64\. .TP \fB\-z nodefaultlib\fR Make the dynamic loader ignore default search paths\. @@ -470,18 +524,18 @@ Mark DSO to be initialized first at runtime\. \fB\-z interpose\fR Mark object to interpose all DSOs but executable\. 
.TP -\fB\-(\fR, \fB\-)\fR, \fB\-EL\fR, \fB\-O\fR\fInumber\fR, \fB\-\-allow\-shlib\-undefined\fR, \fB\-\-dc\fR, \fB\-\-dp\fR, \fB\-\-end\-group\fR, \fB\-\-no\-add\-needed\fR, \fB\-\-no\-allow\-shlib\-undefined\fR, \fB\-\-no\-copy\-dt\-needed\-entries\fR, \fB\-\-no\-fatal\-warnings\fR, \fB\-\-nostdlib\fR, \fB\-\-rpath\-link=Ar dir\fR, \fB\-\-sort\-common\fR, \fB\-\-sort\-section\fR, \fB\-\-start\-group\fR, \fB\-\-warn\-constructors\fR, \fB\-\-warn\-once\fR, \fB\-\-fix\-cortex\-a53\-835769\fR, \fB\-\-fix\-cortex\-a53\-843419\fR, \fB\-z combreloc\fR, \fB\-z common\-page\-size\fR, \fB\-z nocombreloc\fR +\fB\-(\fR, \fB\-)\fR, \fB\-EL\fR, \fB\-O\fR\fInumber\fR, \fB\-\-dc\fR, \fB\-\-dp\fR, \fB\-\-end\-group\fR, \fB\-\-no\-add\-needed\fR, \fB\-\-no\-copy\-dt\-needed\-entries\fR, \fB\-\-nostdlib\fR, \fB\-\-rpath\-link=Ar dir\fR, \fB\-\-sort\-common\fR, \fB\-\-sort\-section\fR, \fB\-\-start\-group\fR, \fB\-\-warn\-constructors\fR, \fB\-\-warn\-once\fR, \fB\-\-fix\-cortex\-a53\-835769\fR, \fB\-\-fix\-cortex\-a53\-843419\fR, \fB\-z combreloc\fR, \fB\-z common\-page\-size\fR, \fB\-z nocombreloc\fR Ignored .SH "ENVIRONMENT VARIABLES" .TP \fBMOLD_JOBS\fR -If this variable is set to \fB1\fR, only one process of \fBmold\fR runs actively\. A mold process invoked while another active mold process is running will wait before doing anything until the active process exits\. +If this variable is set to \fB1\fR, only one \fBmold\fR process will run at a time\. If a new mold process is initiated while another is already active, the new process will wait until the active one completes before starting\. .IP -The purpose of this environment variable is to reduce peak memory usage\. Since mold is highly parallelized, there's no point in running it simultaneously\. If you run N instances of mold in parallel, it would take N times more time and N times more memory\. If you run them serially, it would still take N times more to finish, but their peak memory usage is reduced to normal\. 
+The primary reason for this environment variable is to minimize peak memory usage\. Since mold is designed to operate with high parallelism, running multiple mold instances simultaneously may not be beneficial\. If you execute N instances of mold concurrently, it could require N times the time and N times the memory\. On the other hand, running them one after the other might still take N times longer, but the peak memory usage would be the same as running just a single instance\. .IP -If your build system tends to invoke multiple linker processes simultaneously, you may want to try to set this environment variable to \fB1\fR to see if it could improve overall performance\. +If your build system invokes multiple linker processes simultaneously and some of them often get killed due to out\-of\-memory errors, you might consider setting this environment variable to \fB1\fR to see if it addresses the OOM issue\. .IP -Currently, any value other than 1 is silently ignored\. +Currently, any value other than \fB1\fR is silently ignored\. .TP \fBMOLD_DEBUG\fR If this variable is set to a non\-empty string, \fBmold\fR embeds its command\-line options in the output file's \fB\.comment\fR section\. diff --git a/docs/mold.md b/docs/mold.md index bc65e305..19e7c25b 100644 --- a/docs/mold.md +++ b/docs/mold.md @@ -126,6 +126,16 @@ GNU linkers, for which some configurable values, such as system-dependent library search paths, are hard-coded. `mold` depends only on its command-line arguments. +## OPTION NOTATIONS + +Multi-letter long options may precede either a single dash or double dashes, +except for those starting with the letter "o". For historical reasons, long +options beginning with "o" must precede double dashes. + +For example, you can spell `--as-needed` as `-as-needed`, but `--omagic` must +not be spelled as `-omagic`. `-omagic` will be interpreted not as `--omagic` +but as `-o magic`. + ## MOLD-SPECIFIC OPTIONS * `--chroot`=_dir_: @@ -142,6 +152,9 @@ arguments. 
* `--no-color-diagnostics`: Synonym for `--color-diagnostics=never`. +* `--detach`, `--no-detach`: + Permit or do not permit mold to create a debug info file in the background. + * `--fork`, `--no-fork`: Spawn a child process and let it do the actual linking. When linking a large program, the OS kernel can take a few hundred milliseconds to terminate a @@ -158,6 +171,15 @@ arguments. object file in a static archive got linked or why some shared library is kept in an output file's dependency list even with `--as-needed`. +* `--relocatable-merge-sections`: + By default, `mold` doesn't merge input sections by name when merging input + object files into a single output object file for `-r`. For example, + `.text.foo` and `.text.bar` aren't merged for `-r` even though they are + merged into `.text` based on the default section merging rules. + + This option changes the behavior so that `mold` merges input sections by + name by the default section merging rules. + * `--repro`: Archive input files, as well as a text file containing command line options, in a tar file so that you can run `mold` with the exact same inputs again. @@ -179,10 +201,29 @@ arguments. easily test that your program works in the reversed initialization order. * `--run` _command_ _arg_...: - Run _command_ with `mold` `/usr/bin/ld`. Specifically, `mold` runs a given - command with the `LD_PRELOAD` environment set to intercept exec(3) family - functions and replaces `argv[0]` with itself if it is `ld`, `ld.gold`, or - `ld.lld`. + Run _command_ with `mold` as `/usr/bin/ld`. Specifically, `mold` runs a + given command with the `LD_PRELOAD` environment set to intercept exec(3) + family functions and replaces `argv[0]` with itself if it is `ld`, + `ld.gold`, or `ld.lld`. + +* `--separate-debug-file`, `--separate-debug-file`=_file_: + Bundle debug info sections into a separate file instead of embedding them in + an output executable or a shared library. 
mold creates a debug info file in + the background by default, so that you can start running your executable as + soon as possible. + + By default, the debug info file is created in the same directory as the + output file, with the `.dbg` file extension. That filename is embedded into + the output file so that `gdb` can automatically find the debug info file for + the output file. For more info about gdb features related to separate debug + files, see + . + + mold holds a file lock with flock(2) while creating a debug info file in the + background. + + If you don't want to create a debug info file in the background, pass the + `--no-detach` option. * `--shuffle-sections`, `--shuffle-sections`=_number_: Randomize the output by shuffling the order of input sections before @@ -203,6 +244,15 @@ arguments. `--shuffle-sections`, you can isolate your program's real performance number from the randomness caused by memory layout changes. +* `--spare-program-headers`=_number_: + Append the given number of `PT_NULL` entries to the end of the program + header, so that post-link processing tools can easily add new segments by + overwriting the null entries. + + Note that ELF requires all `PT_LOAD` segments to be sorted by `p_vaddr`. + Therefore, if you add a new LOAD segment, you may need to sort the entire + program header. + * `--stats`: Print input statistics. @@ -218,6 +268,33 @@ arguments. * `--quick-exit`, `--no-quick-exit`: Use or do not use `quick_exit` to exit. +* `-z rewrite-endbr`, `-z norewrite-endbr`: + As a security measure, some CPU instruction sets have recently gained a + feature to protect control flow integrity by disallowing indirect branches + by default. If the feature is enabled, the instruction that is executed + immediately after an indirect branch must be a branch target marker + instruction, or a CPU-level fault will be raised. The marker instruction is also + known as a "landing pad" instruction, to which indirect branches can land. 
+ This feature makes ROP attacks harder to conduct. + + To use the feature, a function whose pointer is taken needs to begin with a + landing pad because a function call via a function pointer is compiled to an + indirect branch. On the other hand, if a function is called only directly + (i.e. referred to only by _direct_ branch instructions), it doesn't have to + begin with it. + + By default, the compiler always emits a landing pad at the beginning of each + global function because it doesn't know whether or not the function's + pointer is taken in another translation unit. As a result, the resulting + binary has more attack surface than necessary. + + If `--rewrite-endbr` is given, mold conducts a whole program analysis + to identify functions whose addresses are actually taken and rewrites + landing pads with no-ops for non-address-taken functions, reducing the + attack surface. + + This feature is currently available only on x86-64. + ## GNU-COMPATIBLE OPTIONS * `--help`: @@ -303,15 +380,6 @@ arguments. object files to generate another object file that can be used as an input to a linker. -* `--relocatable-merge-sections`: - By default, `mold` doesn't merge input sections by name when merging input - object files into a single output object file for `-r`. For example, - `.text.foo` and `.text.bar` aren't merged for `-r` even though they are - merged into `.text` according to the default section merging rules. - - This option changes the behavior so that `mold` merges input sections by - name by the default section merging rules. - * `-s`, `--strip-all`: Omit `.symtab` section from the output file. @@ -321,6 +389,9 @@ arguments. _symbol_, pull out the object file and link it so that the output file contains a definition of _symbol_. +* `-y` _symbol_, `--trace-symbol`=_symbol_: + Trace references to _symbol_. + * `--Bdynamic`: Link against shared libraries. @@ -337,8 +408,18 @@ arguments. 
This option has the same effect as `--Bsymbolic` but works only for function symbols. Data symbols remain being both imported and exported. +* `--Bsymbolic-non-weak`: + This option has the same effect as `--Bsymbolic` but works only for non-weak + symbols. Weak symbols remain being both imported and exported. + +* `--Bsymbolic-non-weak-functions`: + This option has the same effect as `--Bsymbolic` but works only for non-weak + function symbols. Data symbols and weak function symbols remain being both + imported and exported. + * `--Bno-symbolic`: - Cancel `--Bsymbolic` and `--Bsymbolic-functions`. + Cancel `--Bsymbolic`, `--Bsymbolic-functions`, `--Bsymbolic-non-weak` and + `--Bsymbolic-non-weak-functions`. * `--Map`=_file_: Write map file to _file_. @@ -358,6 +439,23 @@ arguments. report an error for duplicate definitions and instead use the first definition. +* `--allow-shlib-undefined`, `--no-allow-shlib-undefined`: + Even if mold succeeds in linking a main executable without undefined symbol + errors, you may still encounter symbol lookup errors at runtime because the + dynamic linker cannot find some symbols in shared libraries in any ELF + module. This occurs because mold ignores undefined symbols in shared + libraries by default. + + If you pass `--no-allow-shlib-undefined`, mold verifies that undefined + symbols in shared libraries given to the linker can be resolved at + link-time. In other words, this converts the runtime error to a link-time + error. + + Note that you need to pass all shared libraries, including indirectly + dependent ones, to the linker as arguments for `-l`. If a shared library + depends on a library that's not passed to the linker, the verification will + be skipped for that file. + * `--as-needed`, `--no-as-needed`: By default, shared libraries given to the linker are unconditionally added to the list of required libraries in an output file. However, shared @@ -369,13 +467,13 @@ arguments. 
The `--no-as-needed` option restores the default behavior for subsequent files. -* `--build-id`=[ `md5` | `sha1` | `sha256` | `uuid` | `0x`_hexstring_ | `none` ]: +* `--build-id`=[ `md5` | `sha1` | `sha256` | `fast` | `uuid` | `0x`_hexstring_ | `none` ]: Create a `.note.gnu.build-id` section containing a byte string to uniquely identify an output file. `sha256` compute a 256-bit cryptographic hash of an output file and set it to build-id. `md5` and `sha1` compute the same hash but truncate it to 128 and 160 bits, respectively, before setting it to build-id. `uuid` sets a random 128-bit UUID. `0x`_hexstring_ sets - _hexstring_. + _hexstring_. `fast` is a synonym for `sha256`. * `--build-id`: Synonym for `--build-id=sha256`. @@ -431,17 +529,22 @@ arguments. `--disable-new-dtags`, `mold` emits `DT_RPATH` for `--rpath` instead. * `--execute-only`: - Traditionally, most processors require both executable and readable bits to - 1 to make the page executable, which allows machine code to be read as data - at runtime. This is actually what an attacker often does after gaining a - limited control of a process to find pieces of machine code they can use to - gain the full control of the process. As a mitigation, some recent - processors allows "execute-only" pages. If a page is execute-only, you can - call a function there as long as you know its address but can't read it as - data. - - This option marks text segments execute-only. This option currently works - only on some ARM64 processors. + + Traditionally, setting the executable bit to 1 for a memory page implies + that the page also becomes readable, which allows machine code to be read + as data at runtime. That is actually what an attacker often does after + gaining a limited control of a process to find pieces of machine code + they can use to gain the full control of the process. As a mitigation, + recent processors including some ARM64 ones allow "execute-only" pages. 
+ If a page is execute-only, you can call a function there as long as you + know its address but can't read it as data. + + This option marks text segments as execute-only by setting just the "X" + bit instead of "RX". Note that on most systems, the absence of the "R" + bit in the text segment serves just as a hint. If you run a program + linked with `--execute-only` on a processor that doesn't support + execute-only pages, your executable will likely still function normally, + but the text segment will remain readable. * `--exclude-libs`=_libraries_ ...: Mark all symbols in the given _libraries_ hidden. @@ -526,11 +629,6 @@ arguments. shared libraries linked with `--pack-dyn-relocs=relr`. As of 2022, only ChromeOS, Android and Fuchsia support it. -* `--package-metadata`=_string_: - Embed _string_ to a `.note.package` section. This option is intended to be - used by a package management command such as rpm(8) to embed metadata - regarding a package to each executable file. - * `--pie`, `--pic-executable`, `--no-pie`, `--no-pic-executable`: Create a position-independent executable. @@ -579,7 +677,9 @@ arguments. Create a share library. * `--spare-dynamic-tags`=_number_: - Reserve the given _number_ of tags in `.dynamic` section. + Append the given number of `DT_NULL` entries to the end of the `.dynamic` + section, so that post-link processing tools can easily add new dynamic tags + by overwriting the null entries. * `--start-lib`, `--end-lib`: Handle object files between `--start-lib` and `--end-lib` as if they were in @@ -597,6 +697,10 @@ arguments. * `--trace`: Print name of each input file. +* `--undefined-glob`=_pattern_: + Synonym for `--undefined`, except that `--undefined-glob` takes a glob + pattern instead of just a single symbol name. + * `--undefined-version`, `--no-undefined-version`: By default, `mold` warns on a symbol specified by a version script or by `--export-dynamic-symbol` if it is not defined. 
You can silence the warning @@ -684,12 +788,14 @@ arguments. `.text.exit` as separate sections in the final binary instead of merging them as `.text`. +* `-z rodynamic`: + Make the `.dynamic` section read-only. + * `-z relro`, `-z norelro`: - Some sections such as `.dynamic` have to be writable only during an - executable or a shared library file is being loaded to memory. Once the - dynamic linker finishes its job, such sections won't be mutated by anyone. - As a security mitigation, it is preferred to make such segments read-only - during program execution. + Some sections such as `.dynamic` have to be writable only during a module is + being loaded to memory. Once the dynamic linker finishes its job, such + sections won't be mutated by anyone. As a security mitigation, it is + preferred to make such segments read-only during program execution. `-z relro` puts such sections into a special segment called `relro`. The dynamic linker makes a relro segment read-only after it finishes its job. @@ -724,6 +830,15 @@ arguments. Control-flow Enforcement Technology (CET), which is available since Tiger Lake (2020). +* `-z start_stop_visibility`=[ `hidden` | `protected` ]: + If a section name is valid as a C identifier (i.e., it matches + `/^[_a-zA-Z][_a-zA-Z0-9]*$/`), mold creates `__start_SECNAME` and + `__stop_SECNAME` symbols to mark the beginning and end of the section, + where `SECNAME` is the section name. + + You can make these marker symbols visible from other ELF modules by passing + `-z start_stop_visibility=protected`. Default is `hidden`. + * `-z text`, `-z notext`, `-z textoff`: `mold` by default reports an error if dynamic relocations are created in read-only sections. If `-z notext` or `-z textoff` are given, `mold` creates @@ -731,9 +846,11 @@ arguments. default behavior. * `-z max-page-size`=_number_: - Some CPU ISAs support multiple different memory page sizes. This option - specifies the maximum page size that an output binary can run on. 
The - default value is 4 KiB for i386, x86-64, and RISC-V, and 64 KiB for ARM64. + Some CPU ISAs support multiple memory page sizes. This option specifies the + maximum page size that an output binary can run on. In general, binaries + built for a larger page size can run on a system with a smaller page size, + but not vice versa. The default value is 4 KiB for i386, x86-64, and RISC-V, + and 64 KiB for ARM64. * `-z nodefaultlib`: Make the dynamic loader ignore default search paths. @@ -758,7 +875,7 @@ arguments. * `-z interpose`: Mark object to interpose all DSOs but executable. -* `-(`, `-)`, `-EL`, `-O`_number_, `--allow-shlib-undefined`, `--dc`, `--dp`, `--end-group`, `--no-add-needed`, `--no-allow-shlib-undefined`, `--no-copy-dt-needed-entries`, `--no-fatal-warnings`, `--nostdlib`, `--rpath-link=Ar dir`, `--sort-common`, `--sort-section`, `--start-group`, `--warn-constructors`, `--warn-once`, `--fix-cortex-a53-835769`, `--fix-cortex-a53-843419`, `-z combreloc`, `-z common-page-size`, `-z nocombreloc`: +* `-(`, `-)`, `-EL`, `-O`_number_, `--dc`, `--dp`, `--end-group`, `--no-add-needed`, `--no-copy-dt-needed-entries`, `--nostdlib`, `--rpath-link=Ar dir`, `--sort-common`, `--sort-section`, `--start-group`, `--warn-constructors`, `--warn-once`, `--fix-cortex-a53-835769`, `--fix-cortex-a53-843419`, `-z combreloc`, `-z common-page-size`, `-z nocombreloc`: Ignored ## ENVIRONMENT VARIABLES diff --git a/elf/arch-alpha.cc b/elf/arch-alpha.cc deleted file mode 100644 index d7189434..00000000 --- a/elf/arch-alpha.cc +++ /dev/null @@ -1,330 +0,0 @@ -// Alpha is a 64-bit RISC ISA developed by DEC (Digital Equipment -// Corporation) in the early '90s. It aimed to be an ISA that would last -// 25 years. DEC expected Alpha would become 1000x faster during that time -// span. Since the ISA was developed from scratch for future machines, -// it's 64-bit from the beginning. There's no 32-bit variant. -// -// DEC ported its own Unix (Tru64) to Alpha. 
Microsoft also ported Windows -// NT to it. But it wasn't a huge commercial success. -// -// DEC was acquired by Compaq in 1997. In the late '90s, Intel and -// Hewlett-Packard were advertising that their upcoming Itanium processor -// would achieve significantly better performance than RISC processors, so -// Compaq decided to discontinue the Alpha processor line to switch to -// Itanium. Itanium resulted in a miserable failure, but it still suceeded -// to wipe out several RISC processors just by promising overly optimistic -// perf numbers. Alpha as an ISA would probably have been fine after 25 -// years since its introduction (which is 1992 + 25 = 2017), but the -// company and its market didn't last that long. -// -// From the linker's point of view, there are a few peculiarities in its -// psABI as shown below: -// -// - Alpha lacks PC-relative memory load/store instructions, so it uses -// register-relative load/store instructions in position-independent -// code. Specifically, GP (which is an alias for $r29) is always -// maintained to refer to .got+0x8000, and global variables' addresses -// are loaded in a GP-relative manner. -// -// - It looks like even function addresses are first loaded to register -// in a GP-relative manner before calling it. We can relax it to -// convert the instruction sequence with a direct branch instruction, -// but by default, object files don't use a direct branch to call a -// function. Therefore, by default, we don't need to create a PLT. -// Any function call is made by first reading its address from GOT and -// jump to the address. - -#include "mold.h" - -namespace mold::elf { - -using E = ALPHA; - -// A 32-bit immediate can be materialized in a register with a "load high" -// and a "load low" instruction sequence. The first instruction sets the -// upper 16 bits in a register, and the second one set the lower 16 -// bits. When doing so, they sign-extend an immediate. 
Therefore, if the -// 15th bit of an immediate happens to be 1, setting a "low half" value -// negates the upper 16 bit values that has already been set in a -// register. To compensate that, we need to add 0x8000 when setting the -// upper 16 bits. -static u32 hi(u32 val) { - return bits(val + 0x8000, 31, 16); -} - -template <> -void write_plt_header(Context &ctx, u8 *buf) {} - -template <> -void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) {} - -template <> -void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) {} - -template <> -void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, - u64 offset, u64 val) { - u8 *loc = ctx.buf + this->shdr.sh_offset + offset; - - switch (rel.r_type) { - case R_NONE: - break; - case R_ALPHA_SREL32: - *(ul32 *)loc = val - this->shdr.sh_addr - offset; - break; - default: - Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; - } -} - -template <> -void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { - std::span> rels = get_rels(ctx); - - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - - for (i64 i = 0; i < rels.size(); i++) { - const ElfRel &rel = rels[i]; - if (rel.r_type == R_NONE) - continue; - - Symbol &sym = *file.symbols[rel.r_sym]; - u8 *loc = base + rel.r_offset; - - u64 S = sym.get_addr(ctx); - u64 A = rel.r_addend; - u64 P = get_addr() + rel.r_offset; - u64 G = sym.get_got_idx(ctx) * sizeof(Word); - u64 GOT = ctx.got->shdr.sh_addr; - u64 GP = ctx.got->shdr.sh_addr + 0x8000; - - switch (rel.r_type) { - case R_ALPHA_REFQUAD: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); - break; - case R_ALPHA_GPREL32: - *(ul32 *)loc = S + A - GP; - break; - case R_ALPHA_LITERAL: - if (A) - *(ul16 *)loc = ctx.extra.got->get_addr(sym, A) - GP; - else - *(ul16 *)loc = GOT + G - GP; - break; - case R_ALPHA_BRSGP: - *(ul32 *)loc |= bits(S + A - P - 4, 22, 0); - break; - case 
R_ALPHA_GPDISP: - *(ul16 *)loc = hi(GP - P); - *(ul16 *)(loc + A) = GP - P; - break; - case R_ALPHA_SREL32: - *(ul32 *)loc = S + A - P; - break; - case R_ALPHA_GPRELHIGH: - *(ul16 *)loc = hi(S + A - GP); - break; - case R_ALPHA_GPRELLOW: - *(ul16 *)loc = S + A - GP; - break; - case R_ALPHA_TLSGD: - *(ul16 *)loc = sym.get_tlsgd_addr(ctx) - GP; - break; - case R_ALPHA_TLSLDM: - *(ul16 *)loc = ctx.got->get_tlsld_addr(ctx) - GP; - break; - case R_ALPHA_DTPRELHI: - *(ul16 *)loc = hi(S + A - ctx.dtp_addr); - break; - case R_ALPHA_DTPRELLO: - *(ul16 *)loc = S + A - ctx.dtp_addr; - break; - case R_ALPHA_GOTTPREL: - *(ul16 *)loc = sym.get_gottp_addr(ctx) + A - GP; - break; - case R_ALPHA_TPRELHI: - *(ul16 *)loc = hi(S + A - ctx.tp_addr); - break; - case R_ALPHA_TPRELLO: - *(ul16 *)loc = S + A - ctx.tp_addr; - break; - case R_ALPHA_LITUSE: - case R_ALPHA_HINT: - break; - default: - unreachable(); - } - } -} - -template <> -void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { - std::span> rels = get_rels(ctx); - - for (i64 i = 0; i < rels.size(); i++) { - const ElfRel &rel = rels[i]; - if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) - continue; - - Symbol &sym = *file.symbols[rel.r_sym]; - u8 *loc = base + rel.r_offset; - - SectionFragment *frag; - i64 frag_addend; - std::tie(frag, frag_addend) = get_fragment(ctx, rel); - - u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); - u64 A = frag ? 
frag_addend : (i64)rel.r_addend; - - switch (rel.r_type) { - case R_ALPHA_REFLONG: - if (std::optional val = get_tombstone(sym, frag)) - *(ul32 *)loc = *val; - else - *(ul32 *)loc = S + A; - break; - case R_ALPHA_REFQUAD: - if (std::optional val = get_tombstone(sym, frag)) - *(ul64 *)loc = *val; - else - *(ul64 *)loc = S + A; - break; - default: - Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " - << rel; - } - } -} - -template <> -void InputSection::scan_relocations(Context &ctx) { - assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); - std::span> rels = get_rels(ctx); - - for (i64 i = 0; i < rels.size(); i++) { - const ElfRel &rel = rels[i]; - if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) - continue; - - Symbol &sym = *file.symbols[rel.r_sym]; - - if (sym.is_ifunc()) - Error(ctx) << sym << ": GNU ifunc symbol is not supported on Alpha"; - - switch (rel.r_type) { - case R_ALPHA_REFQUAD: - scan_dyn_absrel(ctx, sym, rel); - break; - case R_ALPHA_LITERAL: - if (rel.r_addend) - ctx.extra.got->add_symbol(sym, rel.r_addend); - else - sym.flags |= NEEDS_GOT; - break; - case R_ALPHA_SREL32: - scan_pcrel(ctx, sym, rel); - break; - case R_ALPHA_BRSGP: - if (sym.is_imported) - sym.flags |= NEEDS_PLT; - break; - case R_ALPHA_TLSGD: - sym.flags |= NEEDS_TLSGD; - break; - case R_ALPHA_TLSLDM: - ctx.needs_tlsld = true; - break; - case R_ALPHA_GOTTPREL: - sym.flags |= NEEDS_GOTTP; - break; - case R_ALPHA_TPRELHI: - case R_ALPHA_TPRELLO: - check_tlsle(ctx, sym, rel); - break; - case R_ALPHA_GPREL32: - case R_ALPHA_LITUSE: - case R_ALPHA_GPDISP: - case R_ALPHA_HINT: - case R_ALPHA_GPRELHIGH: - case R_ALPHA_GPRELLOW: - case R_ALPHA_DTPRELHI: - case R_ALPHA_DTPRELLO: - break; - default: - Fatal(ctx) << *this << ": unknown relocation: " << rel; - } - } -} - -// An R_ALPHA_LITERAL relocation may request the linker to create a GOT -// entry for an external symbol with a non-zero addend. 
This is an unusual -// request which is not found in any other targets. -// -// Referring an external symbol with a non-zero addend is a bad practice -// because we need to create as many dynamic relocations as the number of -// distinctive addends for the same symbol. -// -// We don't want to mess up the implementation of the common GOT section -// for Alpha. So we create another GOT-like section, .alpha_got. Any GOT -// entry for an R_ALPHA_LITERAL reloc with a non-zero addend is created -// not in .got but in .alpha_got. -// -// Since .alpha_got entries are accessed relative to GP, .alpha_got -// needs to be close enough to .got. It's actually placed next to .got. -void AlphaGotSection::add_symbol(Symbol &sym, i64 addend) { - assert(addend); - std::scoped_lock lock(mu); - entries.push_back({&sym, addend}); -} - -bool operator<(const AlphaGotSection::Entry &a, const AlphaGotSection::Entry &b) { - return std::tuple(a.sym->file->priority, a.sym->sym_idx, a.addend) < - std::tuple(b.sym->file->priority, b.sym->sym_idx, b.addend); -}; - -u64 AlphaGotSection::get_addr(Symbol &sym, i64 addend) { - auto it = std::lower_bound(entries.begin(), entries.end(), Entry{&sym, addend}); - assert(it != entries.end()); - return this->shdr.sh_addr + (it - entries.begin()) * sizeof(Word); -} - -i64 AlphaGotSection::get_reldyn_size(Context &ctx) const { - i64 n = 0; - for (const Entry &e : entries) - if (e.sym->is_imported || (ctx.arg.pic && !e.sym->is_absolute())) - n++; - return n; -} - -void AlphaGotSection::finalize() { - sort(entries); - remove_duplicates(entries); - shdr.sh_size = entries.size() * sizeof(Word); -} - -void AlphaGotSection::copy_buf(Context &ctx) { - ElfRel *dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - reldyn_offset); - - for (i64 i = 0; i < entries.size(); i++) { - Entry &e = entries[i]; - u64 P = this->shdr.sh_addr + sizeof(Word) * i; - ul64 *buf = (ul64 *)(ctx.buf + this->shdr.sh_offset + sizeof(Word) * i); - - if (e.sym->is_imported) { - *buf = 
ctx.arg.apply_dynamic_relocs ? e.addend : 0; - *dynrel++ = ElfRel(P, E::R_ABS, e.sym->get_dynsym_idx(ctx), e.addend); - } else { - *buf = e.sym->get_addr(ctx) + e.addend; - if (ctx.arg.pic && !e.sym->is_absolute()) - *dynrel++ = ElfRel(P, E::R_RELATIVE, 0, *buf); - } - } -} - -} // namespace mold::elf diff --git a/elf/input-sections.cc b/elf/input-sections.cc deleted file mode 100644 index 70187ef3..00000000 --- a/elf/input-sections.cc +++ /dev/null @@ -1,555 +0,0 @@ -#include "mold.h" - -#include -#include -#include - -namespace mold::elf { - -typedef enum { - NONE, ERROR, COPYREL, DYN_COPYREL, PLT, CPLT, DYN_CPLT, DYNREL, - BASEREL, IFUNC_DYNREL, -} Action; - -template -bool CieRecord::equals(const CieRecord &other) const { - if (get_contents() != other.get_contents()) - return false; - - std::span> x = get_rels(); - std::span> y = other.get_rels(); - if (x.size() != y.size()) - return false; - - for (i64 i = 0; i < x.size(); i++) { - if (x[i].r_offset - input_offset != y[i].r_offset - other.input_offset || - x[i].r_type != y[i].r_type || - file.symbols[x[i].r_sym] != other.file.symbols[y[i].r_sym] || - get_addend(input_section, x[i]) != get_addend(other.input_section, y[i])) - return false; - } - return true; -} - -static i64 to_p2align(u64 alignment) { - if (alignment == 0) - return 0; - return std::countr_zero(alignment); -} - -template -InputSection::InputSection(Context &ctx, ObjectFile &file, i64 shndx) - : file(file), shndx(shndx) { - if (shndx < file.elf_sections.size()) - contents = {(char *)file.mf->data + shdr().sh_offset, (size_t)shdr().sh_size}; - - if (shdr().sh_flags & SHF_COMPRESSED) { - ElfChdr &chdr = *(ElfChdr *)&contents[0]; - sh_size = chdr.ch_size; - p2align = to_p2align(chdr.ch_addralign); - } else { - sh_size = shdr().sh_size; - p2align = to_p2align(shdr().sh_addralign); - } - - // Sections may have been compressed. 
We usually uncompress them - // directly into the mmap'ed output file, but we want to uncompress - // early for REL-type ELF types to read relocation addends from - // section contents. For RELA-type, we don't need to do this because - // addends are in relocations. - // - // SH-4 stores addends to sections despite being RELA, which is a - // special (and buggy) case. - if constexpr (!E::is_rela || is_sh4) - uncompress(ctx); -} - -template -void InputSection::uncompress(Context &ctx) { - if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed) - return; - - u8 *buf = new u8[sh_size]; - uncompress_to(ctx, buf); - contents = std::string_view((char *)buf, sh_size); - ctx.string_pool.emplace_back(buf); - uncompressed = true; -} - -template -void InputSection::uncompress_to(Context &ctx, u8 *buf) { - if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed) { - memcpy(buf, contents.data(), contents.size()); - return; - } - - if (contents.size() < sizeof(ElfChdr)) - Fatal(ctx) << *this << ": corrupted compressed section"; - - ElfChdr &hdr = *(ElfChdr *)&contents[0]; - std::string_view data = contents.substr(sizeof(ElfChdr)); - - switch (hdr.ch_type) { - case ELFCOMPRESS_ZLIB: { - unsigned long size = sh_size; - if (::uncompress(buf, &size, (u8 *)data.data(), data.size()) != Z_OK) - Fatal(ctx) << *this << ": uncompress failed"; - assert(size == sh_size); - break; - } - case ELFCOMPRESS_ZSTD: - if (ZSTD_decompress(buf, sh_size, (u8 *)data.data(), data.size()) != sh_size) - Fatal(ctx) << *this << ": ZSTD_decompress failed"; - break; - default: - Fatal(ctx) << *this << ": unsupported compression type: 0x" - << std::hex << hdr.ch_type; - } -} - -template -static void scan_rel(Context &ctx, InputSection &isec, Symbol &sym, - const ElfRel &rel, Action action) { - bool writable = (isec.shdr().sh_flags & SHF_WRITE); - - auto error = [&] { - std::string msg = sym.is_absolute() ? 
"-fno-PIC" : "-fPIC"; - Error(ctx) << isec << ": " << rel << " relocation at offset 0x" - << std::hex << rel.r_offset << " against symbol `" - << sym << "' can not be used; recompile with " << msg; - }; - - auto check_textrel = [&] { - if (!writable) { - if (ctx.arg.z_text) { - error(); - } else if (ctx.arg.warn_textrel) { - Warn(ctx) << isec << ": relocation against symbol `" << sym - << "' in read-only section"; - } - ctx.has_textrel = true; - } - }; - - auto copyrel = [&] { - assert(sym.is_imported); - if (sym.esym().st_visibility == STV_PROTECTED) { - Error(ctx) << isec - << ": cannot make copy relocation for protected symbol '" << sym - << "', defined in " << *sym.file << "; recompile with -fPIC"; - } - sym.flags |= NEEDS_COPYREL; - }; - - auto dynrel = [&] { - check_textrel(); - isec.file.num_dynrel++; - }; - - switch (action) { - case NONE: - break; - case ERROR: - // Print out the "recompile with -fPIC" error message. - error(); - break; - case COPYREL: - // Create a copy relocation. - if (!ctx.arg.z_copyreloc) - error(); - copyrel(); - break; - case DYN_COPYREL: - // Same as COPYREL but try to avoid creating a copy relocation by - // creating a dynamic relocation instead if the relocation is in - // a writable section. - // - // GHC (Glasgow Haskell Compiler) places a small amount of data in - // .text before each function and access that data with a fixed - // offset. The function breaks if we copy-relocate the data. For such - // programs, we should avoid copy relocations if possible. - // - // Besides GHC, copy relocation is a hacky solution, so if we can - // represent a relocation either with copyrel or dynrel, we prefer - // dynamic relocation. - if (writable || !ctx.arg.z_copyreloc) - dynrel(); - else - copyrel(); - break; - case PLT: - // Create a PLT entry. - sym.flags |= NEEDS_PLT; - break; - case CPLT: - // Create a canonical PLT entry. 
- sym.flags |= NEEDS_CPLT; - break; - case DYN_CPLT: - // Same as CPLT but try to avoid creating a canonical PLT creating by - // creating a dynamic relocation instead if the relocation is in a - // writable section. The motivation behind it is hte same as DYN_COPYREL. - if (writable) - dynrel(); - else - sym.flags |= NEEDS_CPLT; - break; - case DYNREL: - // Create a dynamic relocation. - dynrel(); - break; - case BASEREL: - // Create a base relocation. - check_textrel(); - if (!isec.is_relr_reloc(ctx, rel)) - isec.file.num_dynrel++; - break; - case IFUNC_DYNREL: - // Create an IRELATIVE relocation for a GNU ifunc symbol. - // - // We usually create an IRELATIVE relocation in .got for each ifunc. - // However, if a statically-initialized pointer is initialized to an - // ifunc's address, we have no choice other than emitting an IRELATIVE - // relocation for each such pointer. - dynrel(); - ctx.num_ifunc_dynrels++; - break; - default: - unreachable(); - } -} - -template -static inline i64 get_output_type(Context &ctx) { - if (ctx.arg.shared) - return 0; - if (ctx.arg.pie) - return 1; - return 2; -} - -template -static inline i64 get_sym_type(Symbol &sym) { - if (sym.is_absolute()) - return 0; - if (!sym.is_imported) - return 1; - if (sym.get_type() != STT_FUNC) - return 2; - return 3; -} - -template -static Action get_pcrel_action(Context &ctx, Symbol &sym) { - // This is for PC-relative relocations (e.g. R_X86_64_PC32). - // We cannot promote them to dynamic relocations because the dynamic - // linker generally does not support PC-relative relocations. 
- static Action table[3][4] = { - // Absolute Local Imported data Imported code - { ERROR, NONE, ERROR, PLT }, // Shared object - { ERROR, NONE, COPYREL, PLT }, // Position-independent exec - { NONE, NONE, COPYREL, CPLT }, // Position-dependent exec - }; - - return table[get_output_type(ctx)][get_sym_type(sym)]; -} - -template -static Action get_absrel_action(Context &ctx, Symbol &sym) { - // This is a decision table for absolute relocations that is smaller - // than the pointer size (e.g. R_X86_64_32). Since the dynamic linker - // generally does not support dynamic relocations smaller than the - // pointer size, we need to report an error if a relocation cannot be - // resolved at link-time. - static Action table[3][4] = { - // Absolute Local Imported data Imported code - { NONE, ERROR, ERROR, ERROR }, // Shared object - { NONE, ERROR, ERROR, ERROR }, // Position-independent exec - { NONE, NONE, COPYREL, CPLT }, // Position-dependent exec - }; - - return table[get_output_type(ctx)][get_sym_type(sym)]; -} - -template -static Action get_dyn_absrel_action(Context &ctx, Symbol &sym) { - if (sym.is_ifunc()) - return sym.is_pde_ifunc(ctx) ? NONE : IFUNC_DYNREL; - - // This is a decision table for absolute relocations for the pointer - // size data (e.g. R_X86_64_64). Unlike the absrel_table, we can emit - // a dynamic relocation if we cannot resolve an address at link-time. - static Action table[3][4] = { - // Absolute Local Imported data Imported code - { NONE, BASEREL, DYNREL, DYNREL }, // Shared object - { NONE, BASEREL, DYNREL, DYNREL }, // Position-independent exec - { NONE, NONE, DYN_COPYREL, DYN_CPLT }, // Position-dependent exec - }; - - return table[get_output_type(ctx)][get_sym_type(sym)]; -} - -template -static Action get_ppc64_toc_action(Context &ctx, Symbol &sym) { - if (sym.is_ifunc()) - return IFUNC_DYNREL; - - // As a special case, we do not create copy relocations nor canonical - // PLTs for .toc sections. 
PPC64's .toc is a compiler-generated - // GOT-like section, and no user-generated code directly uses values - // in it. - static Action table[3][4] = { - // Absolute Local Imported data Imported code - { NONE, BASEREL, DYNREL, DYNREL }, // Shared object - { NONE, BASEREL, DYNREL, DYNREL }, // Position-independent exec - { NONE, NONE, DYNREL, DYNREL }, // Position-dependent exec - }; - - return table[get_output_type(ctx)][get_sym_type(sym)]; -} - -template -void InputSection::scan_pcrel(Context &ctx, Symbol &sym, - const ElfRel &rel) { - scan_rel(ctx, *this, sym, rel, get_pcrel_action(ctx, sym)); -} - -template -void InputSection::scan_absrel(Context &ctx, Symbol &sym, - const ElfRel &rel) { - scan_rel(ctx, *this, sym, rel, get_absrel_action(ctx, sym)); -} - -template -void InputSection::scan_dyn_absrel(Context &ctx, Symbol &sym, - const ElfRel &rel) { - scan_rel(ctx, *this, sym, rel, get_dyn_absrel_action(ctx, sym)); -} - -template -void InputSection::scan_toc_rel(Context &ctx, Symbol &sym, - const ElfRel &rel) { - scan_rel(ctx, *this, sym, rel, get_ppc64_toc_action(ctx, sym)); -} - -template -void InputSection::scan_tlsdesc(Context &ctx, Symbol &sym) { - if (ctx.arg.is_static || - (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { - // Relax TLSDESC to Local Exec. In this case, we directly materialize - // a TP-relative offset, so no dynamic relocation is needed. - // - // TLSDESC relocs must always be relaxed for statically-linked - // executables even if -no-relax is given. It is because a - // statically-linked executable doesn't contain a trampoline - // function needed for TLSDESC. - } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { - // In this condition, TP-relative offset of a thread-local variable - // is known at process startup time, so we can relax TLSDESC to the - // code that reads the TP-relative offset from GOT and add TP to it. 
- sym.flags |= NEEDS_GOTTP; - } else { - // If no relaxation is doable, we simply create a TLSDESC dynamic - // relocation. - sym.flags |= NEEDS_TLSDESC; - } -} - -template -void InputSection::check_tlsle(Context &ctx, Symbol &sym, - const ElfRel &rel) { - if (ctx.arg.shared) - Error(ctx) << *this << ": relocation " << rel << " against `" << sym - << "` can not be used when making a shared object;" - << " recompile with -fPIC"; -} - -template -static void apply_absrel(Context &ctx, InputSection &isec, - Symbol &sym, const ElfRel &rel, u8 *loc, - u64 S, i64 A, u64 P, ElfRel *&dynrel, - Action action) { - bool writable = (isec.shdr().sh_flags & SHF_WRITE); - - auto emit_abs_dynrel = [&] { - *dynrel++ = ElfRel(P, E::R_ABS, sym.get_dynsym_idx(ctx), A); - if (ctx.arg.apply_dynamic_relocs) - *(Word *)loc = A; - }; - - switch (action) { - case COPYREL: - case CPLT: - case NONE: - *(Word *)loc = S + A; - break; - case BASEREL: - if (isec.is_relr_reloc(ctx, rel)) { - *(Word *)loc = S + A; - } else { - *dynrel++ = ElfRel(P, E::R_RELATIVE, 0, S + A); - if (ctx.arg.apply_dynamic_relocs) - *(Word *)loc = S + A; - } - break; - case DYN_COPYREL: - if (writable || !ctx.arg.z_copyreloc) - emit_abs_dynrel(); - else - *(Word *)loc = S + A; - break; - case DYN_CPLT: - if (writable) - emit_abs_dynrel(); - else - *(Word *)loc = S + A; - break; - case DYNREL: - emit_abs_dynrel(); - break; - case IFUNC_DYNREL: - if constexpr (supports_ifunc) { - u64 addr = sym.get_addr(ctx, NO_PLT) + A; - *dynrel++ = ElfRel(P, E::R_IRELATIVE, 0, addr); - if (ctx.arg.apply_dynamic_relocs) - *(Word *)loc = addr; - } else { - unreachable(); - } - break; - default: - unreachable(); - } -} - -template -void InputSection::apply_dyn_absrel(Context &ctx, Symbol &sym, - const ElfRel &rel, u8 *loc, - u64 S, i64 A, u64 P, - ElfRel **dynrel) { - apply_absrel(ctx, *this, sym, rel, loc, S, A, P, *dynrel, - get_dyn_absrel_action(ctx, sym)); -} - -template -void InputSection::apply_toc_rel(Context &ctx, Symbol &sym, - 
const ElfRel &rel, u8 *loc, - u64 S, i64 A, u64 P, - ElfRel **dynrel) { - apply_absrel(ctx, *this, sym, rel, loc, S, A, P, *dynrel, - get_ppc64_toc_action(ctx, sym)); -} - -template -void InputSection::write_to(Context &ctx, u8 *buf) { - if (shdr().sh_type == SHT_NOBITS || sh_size == 0) - return; - - // Copy data - if constexpr (is_riscv) - copy_contents_riscv(ctx, buf); - else - uncompress_to(ctx, buf); - - // Apply relocations - if (!ctx.arg.relocatable) { - if (shdr().sh_flags & SHF_ALLOC) - apply_reloc_alloc(ctx, buf); - else - apply_reloc_nonalloc(ctx, buf); - - if constexpr (is_x86_64) { - u8 endbr[] = {0xf3, 0x0f, 0x1e, 0xfa}; - u8 nop[] = {0x0f, 0x1f, 0x40, 0x00}; - - // Rewrite the leading endbr instruction with a nop if the section - // is not address-taken. - if (ctx.arg.z_rewrite_endbr && (shdr().sh_flags & SHF_EXECINSTR) && - !address_taken && sh_size >= 4 && memcmp(buf, endbr, 4) == 0) { - memcpy(buf, nop, 4); - } - } - } -} - -// Get the name of a function containin a given offset. -template -std::string_view InputSection::get_func_name(Context &ctx, i64 offset) const { - for (const ElfSym &esym : file.elf_syms) { - if (esym.st_shndx == shndx && esym.st_type == STT_FUNC && - esym.st_value <= offset && offset < esym.st_value + esym.st_size) { - std::string_view name = file.symbol_strtab.data() + esym.st_name; - if (ctx.arg.demangle) - return demangle(name); - return name; - } - } - return ""; -} - -// Test if the symbol a given relocation refers to has already been resolved. -// If not, record that error and returns true. -template -bool InputSection::record_undef_error(Context &ctx, const ElfRel &rel) { - // If a relocation refers to a linker-synthesized symbol for a - // section fragment, it's always been resolved. 
- if (file.elf_syms.size() <= rel.r_sym) - return false; - - Symbol &sym = *file.symbols[rel.r_sym]; - const ElfSym &esym = file.elf_syms[rel.r_sym]; - - // If a symbol is defined in a comdat group, and the comdat group is - // discarded, the symbol may not have an owner. It is technically an - // violation of the One Definition Rule, so it is a programmer's fault. - if (!sym.file) { - Error(ctx) << *this << ": " << sym << " refers to a discarded COMDAT section" - << " probably due to an ODR violation"; - return true; - } - - auto record = [&] { - std::stringstream ss; - if (std::string_view source = file.get_source_name(); !source.empty()) - ss << ">>> referenced by " << source << "\n"; - else - ss << ">>> referenced by " << *this << "\n"; - - ss << ">>> " << file; - if (std::string_view func = get_func_name(ctx, rel.r_offset); !func.empty()) - ss << ":(" << func << ")"; - - typename decltype(ctx.undef_errors)::accessor acc; - ctx.undef_errors.insert(acc, {sym.name(), {}}); - acc->second.push_back(ss.str()); - }; - - // A non-weak undefined symbol must be promoted to an imported - // symbol or resolved to an defined symbol. Otherwise, it's an - // undefined symbol error. - // - // Every ELF file has an absolute local symbol as its first symbol. - // Referring to that symbol is always valid. - bool is_undef = esym.is_undef() && !esym.is_weak() && sym.sym_idx; - if (!sym.is_imported && is_undef && sym.esym().is_undef()) { - record(); - return true; - } - - // If a protected/hidden undefined symbol is resolved to other .so, - // it's handled as if no symbols were found. 
- if (sym.file->is_dso && - (sym.visibility == STV_PROTECTED || sym.visibility == STV_HIDDEN)) { - record(); - return true; - } - - return false; -} - -using E = MOLD_TARGET; - -template struct CieRecord; -template class InputSection; - -} // namespace mold::elf diff --git a/elf/linker-script.cc b/elf/linker-script.cc deleted file mode 100644 index 7ad500bb..00000000 --- a/elf/linker-script.cc +++ /dev/null @@ -1,428 +0,0 @@ -// On Linux, /usr/lib/x86_64-linux-gnu/libc.so is not actually -// a shared object file but an ASCII text file containing a linker -// script to include a "real" libc.so file. Therefore, we need to -// support a (very limited) subset of the linker script language. - -#include "mold.h" - -#include -#include - -namespace mold::elf { - -template -static thread_local MappedFile> *current_file; - -template -void read_version_script(Context &ctx, std::span &tok); - -static std::string_view get_line(std::string_view input, const char *pos) { - assert(input.data() <= pos); - assert(pos < input.data() + input.size()); - - i64 start = input.rfind('\n', pos - input.data()); - if (start == input.npos) - start = 0; - else - start++; - - i64 end = input.find('\n', pos - input.data()); - if (end == input.npos) - end = input.size(); - - return input.substr(start, end - start); -} - -template -class SyntaxError { -public: - SyntaxError(Context &ctx, std::string_view errpos) : out(ctx) { - std::string_view contents = current_file->get_contents(); - std::string_view line = get_line(contents, errpos.data()); - - i64 lineno = 1; - for (i64 i = 0; contents.data() + i < line.data(); i++) - if (contents[i] == '\n') - lineno++; - - i64 column = errpos.data() - line.data(); - - std::stringstream ss; - ss << current_file->name << ":" << lineno << ": "; - i64 indent = (i64)ss.tellp() + strlen("mold: "); - ss << line << "\n" << std::setw(indent + column) << " " << "^ "; - out << ss.str(); - } - - template SyntaxError &operator<<(T &&val) { - out << std::forward(val); - 
return *this; - } - - [[noreturn]] ~SyntaxError() = default; - - Fatal> out; -}; - -template -static std::vector -tokenize(Context &ctx, std::string_view input) { - std::vector vec; - while (!input.empty()) { - if (isspace(input[0])) { - input = input.substr(1); - continue; - } - - if (input.starts_with("/*")) { - i64 pos = input.find("*/", 2); - if (pos == std::string_view::npos) - SyntaxError(ctx, input) << "unclosed comment"; - input = input.substr(pos + 2); - continue; - } - - if (input[0] == '#') { - i64 pos = input.find("\n", 1); - if (pos == std::string_view::npos) - break; - input = input.substr(pos + 1); - continue; - } - - if (input[0] == '"') { - i64 pos = input.find('"', 1); - if (pos == std::string_view::npos) - SyntaxError(ctx, input) << "unclosed string literal"; - vec.push_back(input.substr(0, pos + 1)); - input = input.substr(pos + 1); - continue; - } - - i64 pos = input.find_first_not_of( - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - "0123456789_.$/\\~=+[]*?-!^:"); - - if (pos == 0) - pos = 1; - else if (pos == input.npos) - pos = input.size(); - - vec.push_back(input.substr(0, pos)); - input = input.substr(pos); - } - return vec; -} - -template -static std::span -skip(Context &ctx, std::span tok, std::string_view str) { - if (tok.empty()) - Fatal(ctx) << current_file->name << ": expected '" << str - << "', but got EOF"; - if (tok[0] != str) - SyntaxError(ctx, tok[0]) << "expected '" << str << "'"; - return tok.subspan(1); -} - -static std::string_view unquote(std::string_view s) { - if (s.size() > 0 && s[0] == '"') { - assert(s[s.size() - 1] == '"'); - return s.substr(1, s.size() - 2); - } - return s; -} - -template -static std::span -read_output_format(Context &ctx, std::span tok) { - tok = skip(ctx, tok, "("); - while (!tok.empty() && tok[0] != ")") - tok = tok.subspan(1); - if (tok.empty()) - Fatal(ctx) << current_file->name << ": expected ')', but got EOF"; - return tok.subspan(1); -} - -template -static bool 
is_in_sysroot(Context &ctx, std::string path) { - std::string rel = to_abs_path(path) - .lexically_relative(to_abs_path(ctx.arg.sysroot)) - .string(); - return rel != "." && !rel.starts_with("../"); -} - -template -static MappedFile> *resolve_path(Context &ctx, std::string_view tok) { - std::string str(unquote(tok)); - - // GNU ld prepends the sysroot if a pathname starts with '/' and the - // script being processed is in the sysroot. We do the same. - if (str.starts_with('/') && is_in_sysroot(ctx, current_file->name)) - return MappedFile>::must_open(ctx, ctx.arg.sysroot + str); - - if (str.starts_with('=')) { - std::string path; - if (ctx.arg.sysroot.empty()) - path = str.substr(1); - else - path = ctx.arg.sysroot + str.substr(1); - return MappedFile>::must_open(ctx, path); - } - - if (str.starts_with("-l")) - return find_library(ctx, str.substr(2)); - - if (MappedFile> *mf = open_library(ctx, str)) - return mf; - - for (std::string_view dir : ctx.arg.library_paths) { - std::string path = std::string(dir) + "/" + str; - if (MappedFile> *mf = open_library(ctx, path)) - return mf; - } - - SyntaxError(ctx, tok) << "library not found: " << str; -} - -template -static std::span -read_group(Context &ctx, std::span tok) { - tok = skip(ctx, tok, "("); - - while (!tok.empty() && tok[0] != ")") { - if (tok[0] == "AS_NEEDED") { - bool orig = ctx.as_needed; - ctx.as_needed = true; - tok = read_group(ctx, tok.subspan(1)); - ctx.as_needed = orig; - continue; - } - - MappedFile> *mf = resolve_path(ctx, tok[0]); - read_file(ctx, mf); - tok = tok.subspan(1); - } - - if (tok.empty()) - Fatal(ctx) << current_file->name << ": expected ')', but got EOF"; - return tok.subspan(1); -} - -template -void parse_linker_script(Context &ctx, MappedFile> *mf) { - current_file = mf; - - std::vector vec = tokenize(ctx, mf->get_contents()); - std::span tok = vec; - - while (!tok.empty()) { - if (tok[0] == "OUTPUT_FORMAT") { - tok = read_output_format(ctx, tok.subspan(1)); - } else if (tok[0] == 
"INPUT" || tok[0] == "GROUP") { - tok = read_group(ctx, tok.subspan(1)); - } else if (tok[0] == "VERSION") { - tok = tok.subspan(1); - tok = skip(ctx, tok, "{"); - read_version_script(ctx, tok); - tok = skip(ctx, tok, "}"); - } else if (tok.size() > 3 && tok[1] == "=" && tok[3] == ";") { - ctx.arg.defsyms.emplace_back(get_symbol(ctx, unquote(tok[0])), - get_symbol(ctx, unquote(tok[2]))); - tok = tok.subspan(4); - } else if (tok[0] == ";") { - tok = tok.subspan(1); - } else { - SyntaxError(ctx, tok[0]) << "unknown linker script token"; - } - } -} - -template -std::string_view -get_script_output_type(Context &ctx, MappedFile> *mf) { - current_file = mf; - - std::vector vec = tokenize(ctx, mf->get_contents()); - std::span tok = vec; - - if (tok.size() >= 3 && tok[0] == "OUTPUT_FORMAT" && tok[1] == "(") { - if (tok[2] == "elf64-x86-64") - return X86_64::target_name; - if (tok[2] == "elf32-i386") - return I386::target_name; - } - - if (tok.size() >= 3 && (tok[0] == "INPUT" || tok[0] == "GROUP") && - tok[1] == "(") - if (MappedFile> *mf = - MappedFile>::open(ctx, std::string(unquote(tok[2])))) - return get_machine_type(ctx, mf); - - return ""; -} - -static bool read_label(std::span &tok, - std::string label) { - if (tok.size() >= 1 && tok[0] == label + ":") { - tok = tok.subspan(1); - return true; - } - - if (tok.size() >= 2 && tok[0] == label && tok[1] == ":") { - tok = tok.subspan(2); - return true; - } - return false; -} - -template -static void -read_version_script_commands(Context &ctx, std::span &tok, - std::string_view ver_str, u16 ver_idx, - bool is_global, bool is_cpp) { - while (!tok.empty() && tok[0] != "}") { - if (read_label(tok, "global")) { - is_global = true; - continue; - } - - if (read_label(tok, "local")) { - is_global = false; - continue; - } - - if (tok[0] == "extern") { - tok = tok.subspan(1); - - if (!tok.empty() && tok[0] == "\"C\"") { - tok = tok.subspan(1); - tok = skip(ctx, tok, "{"); - read_version_script_commands( ctx, tok, ver_str, ver_idx, 
is_global, false); - } else { - tok = skip(ctx, tok, "\"C++\""); - tok = skip(ctx, tok, "{"); - read_version_script_commands(ctx, tok, ver_str, ver_idx, is_global, true); - } - - tok = skip(ctx, tok, "}"); - tok = skip(ctx, tok, ";"); - continue; - } - - if (tok[0] == "*") { - ctx.default_version = (is_global ? ver_idx : (u32)VER_NDX_LOCAL); - } else if (is_global) { - ctx.version_patterns.push_back({unquote(tok[0]), current_file->name, - ver_str, ver_idx, is_cpp}); - } else { - ctx.version_patterns.push_back({unquote(tok[0]), current_file->name, - ver_str, VER_NDX_LOCAL, is_cpp}); - } - - tok = tok.subspan(1); - - if (!tok.empty() && tok[0] == "}") - return; - tok = skip(ctx, tok, ";"); - } -} - -template -void read_version_script(Context &ctx, std::span &tok) { - u16 next_ver = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size() + 1; - - while (!tok.empty() && tok[0] != "}") { - std::string_view ver_str; - u16 ver_idx; - - if (tok[0] == "{") { - ver_str = "global"; - ver_idx = VER_NDX_GLOBAL; - } else { - ver_str = tok[0]; - ver_idx = next_ver++; - ctx.arg.version_definitions.push_back(std::string(tok[0])); - tok = tok.subspan(1); - } - - tok = skip(ctx, tok, "{"); - read_version_script_commands(ctx, tok, ver_str, ver_idx, true, false); - tok = skip(ctx, tok, "}"); - if (!tok.empty() && tok[0] != ";") - tok = tok.subspan(1); - tok = skip(ctx, tok, ";"); - } -} - -template -void parse_version_script(Context &ctx, MappedFile> *mf) { - current_file = mf; - std::vector vec = tokenize(ctx, mf->get_contents()); - std::span tok = vec; - read_version_script(ctx, tok); - if (!tok.empty()) - SyntaxError(ctx, tok[0]) << "trailing garbage token"; -} - -template -void read_dynamic_list_commands(Context &ctx, - std::vector &result, - std::span &tok, - bool is_cpp) { - while (!tok.empty() && tok[0] != "}") { - if (tok[0] == "extern") { - tok = tok.subspan(1); - - if (!tok.empty() && tok[0] == "\"C\"") { - tok = tok.subspan(1); - tok = skip(ctx, tok, "{"); - 
read_dynamic_list_commands(ctx, result, tok, false); - } else { - tok = skip(ctx, tok, "\"C++\""); - tok = skip(ctx, tok, "{"); - read_dynamic_list_commands(ctx, result, tok, true); - } - - tok = skip(ctx, tok, "}"); - tok = skip(ctx, tok, ";"); - continue; - } - - result.push_back({unquote(tok[0]), "", is_cpp}); - tok = skip(ctx, tok.subspan(1), ";"); - } -} - -template -std::vector -parse_dynamic_list(Context &ctx, std::string_view path) { - std::string_view contents = - MappedFile>::must_open(ctx, std::string(path))->get_contents(); - std::vector vec = tokenize(ctx, contents); - std::span tok = vec; - std::vector result; - - tok = skip(ctx, tok, "{"); - read_dynamic_list_commands(ctx, result, tok, false); - tok = skip(ctx, tok, "}"); - tok = skip(ctx, tok, ";"); - - if (!tok.empty()) - SyntaxError(ctx, tok[0]) << "trailing garbage token"; - - for (DynamicPattern &p : result) - p.source = path; - - return result; -} - -using E = MOLD_TARGET; - -template void parse_linker_script(Context &, MappedFile> *); -template std::string_view get_script_output_type(Context &, MappedFile> *); -template void parse_version_script(Context &, MappedFile> *); -template std::vector parse_dynamic_list(Context &, std::string_view); - - -} // namespace mold::elf diff --git a/elf/lto.cc b/elf/lto.cc deleted file mode 100644 index 81505529..00000000 --- a/elf/lto.cc +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef _WIN32 -# include "lto-win32.cc" -#else -# include "lto-unix.cc" -#endif diff --git a/install-build-deps.sh b/install-build-deps.sh index 461f0077..4ef3bac1 100755 --- a/install-build-deps.sh +++ b/install-build-deps.sh @@ -1,54 +1,52 @@ #!/bin/sh +# This script installs binary packages needed to build mold. +# Feel free to send me a PR if your OS is not on this list. + set -e . /etc/os-release set -x -# The first line for each distro installs a build dependency. -# The second line installs extra packages for unittests. -# -# Feel free to send me a PR if your OS is not on this list. 
- case "$ID-$VERSION_ID" in ubuntu-20.* | pop-20.*) apt-get update apt-get install -y cmake gcc g++ g++-10 - apt-get install -y file ;; ubuntu-* | pop-* | linuxmint-* | debian-* | raspbian-*) apt-get update apt-get install -y cmake gcc g++ - apt-get install -y file ;; -fedora-*) - dnf install -y gcc-g++ cmake - dnf install -y glibc-static file libstdc++-static diffutils util-linux +fedora-* | amzn-* | rhel-*) + dnf install -y gcc-g++ cmake glibc-static libstdc++-static diffutils util-linux ;; -opensuse-leap-*) - zypper install -y make cmake gcc-c++ gcc11-c++ - zypper install -y glibc-devel-static tar diffutils util-linux - ;; -opensuse-tumbleweed-*) - zypper install -y make cmake gcc-c++ - zypper install -y glibc-devel-static tar diffutils util-linux +opensuse-*) + zypper install -y make cmake gcc-c++ glibc-devel-static tar diffutils util-linux ;; gentoo-*) emerge-webrsync - emerge dev-util/cmake + FEATURES='getbinpkg binpkg-request-signature' emerge dev-build/cmake ;; -arch-* | archarm-* | artix-*) - pacman -Sy - pacman -S --needed --noconfirm base-devel cmake util-linux +arch-* | archarm-* | artix-* | endeavouros-*) + pacman -Sy --needed --noconfirm base-devel cmake util-linux ;; void-*) - xbps-install -Sy xbps - xbps-install -Sy bash make cmake gcc - xbps-install -Sy tar diffutils util-linux + xbps-install -Sy xbps bash make cmake gcc tar diffutils util-linux ;; alpine-*) apk update apk add bash make linux-headers cmake gcc g++ ;; +clear-linux-*) + swupd update + swupd bundle-add c-basic diffutils + ;; +almalinux-*) + dnf install -y gcc-toolset-13-gcc-c++ gcc-toolset-13-libstdc++-devel cmake diffutils + ;; +freebsd-*) + pkg update + pkg install -y cmake bash binutils gcc + ;; *) echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID" exit 1 diff --git a/install-cross-tools.sh b/install-cross-tools.sh index 86dc10dd..fcac8ef8 100755 --- a/install-cross-tools.sh +++ b/install-cross-tools.sh @@ -11,7 +11,7 @@ set -x case "$ID-$VERSION_ID" in 
ubuntu-* | pop-* | linuxmint-* | debian-* | raspbian-*) - apt-get install -y qemu-user {gcc,g++}-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha}-linux-gnu {gcc,g++}-arm-linux-gnueabihf + apt-get install -y qemu-user {gcc,g++}-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4}-linux-gnu {gcc,g++}-arm-linux-gnueabihf ;; *) echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID" diff --git a/common/archive-file.h b/lib/archive-file.h similarity index 92% rename from common/archive-file.h rename to lib/archive-file.h index e2e28525..9ce4a030 100644 --- a/common/archive-file.h +++ b/lib/archive-file.h @@ -26,7 +26,6 @@ #pragma once #include "common.h" -#include "filetype.h" namespace mold { @@ -76,7 +75,7 @@ struct ArHdr { } }; -template +template std::vector read_thin_archive_members(Context &ctx, MappedFile *mf) { u8 *begin = mf->data; @@ -117,14 +116,14 @@ read_thin_archive_members(Context &ctx, MappedFile *mf) { std::string path = name.starts_with('/') ? 
name : (filepath(mf->name).parent_path() / name).string(); - vec.push_back(MappedFile::must_open(ctx, path)); + vec.push_back(must_open_file(ctx, path)); vec.back()->thin_parent = mf; data = body; } return vec; } -template +template std::vector read_fat_archive_members(Context &ctx, MappedFile *mf) { u8 *begin = mf->data; u8 *data = begin + 8; @@ -162,16 +161,13 @@ std::vector read_fat_archive_members(Context &ctx, MappedFile *mf) return vec; } -template +template std::vector read_archive_members(Context &ctx, MappedFile *mf) { - switch (get_file_type(ctx, mf)) { - case FileType::AR: + std::string_view str = mf->get_contents(); + if (str.starts_with("!\n")) return read_fat_archive_members(ctx, mf); - case FileType::THIN_AR: - return read_thin_archive_members(ctx, mf); - default: - unreachable(); - } + assert(str.starts_with("!\n")); + return read_thin_archive_members(ctx, mf); } } // namespace mold diff --git a/common/common.h b/lib/common.h similarity index 66% rename from common/common.h rename to lib/common.h index adc1d7f9..d915c97e 100644 --- a/common/common.h +++ b/lib/common.h @@ -42,23 +42,6 @@ # define unreachable() assert(0 && "unreachable") #endif -// __builtin_assume() is supported only by clang, and [[assume]] is -// available only in C++23, so we use this macro when giving a hint to -// the compiler's optimizer what's true. -#define ASSUME(x) do { if (!(x)) __builtin_unreachable(); } while (0) - -// This is an assert() that is enabled even in the release build. 
-#define ASSERT(x) \ - do { \ - if (!(x)) { \ - std::cerr << "Assertion failed: (" << #x \ - << "), function " << __FUNCTION__ \ - << ", file " << __FILE__ \ - << ", line " << __LINE__ << ".\n"; \ - std::abort(); \ - } \ - } while (0) - inline uint64_t hash_string(std::string_view str) { return XXH3_64bits(str.data(), str.size()); } @@ -79,25 +62,19 @@ namespace mold { using namespace std::literals::string_literals; using namespace std::literals::string_view_literals; -template class OutputFile; - inline char *output_tmpfile; -inline thread_local bool opt_demangle; inline u8 *output_buffer_start = nullptr; inline u8 *output_buffer_end = nullptr; -inline std::string mold_version; -extern std::string mold_version_string; extern std::string mold_git_hash; std::string errno_string(); std::string get_self_path(); void cleanup(); void install_signal_handler(); -i64 get_default_thread_count(); -static u64 combine_hash(u64 a, u64 b) { +inline u64 combine_hash(u64 a, u64 b) { return a ^ (b + 0x9e3779b9 + (a << 6) + (a >> 2)); } @@ -105,103 +82,126 @@ static u64 combine_hash(u64 a, u64 b) { // Error output // -template -class SyncOut { +// Some C++ stdlibs don't support std::osyncstream even though +// it's is in the C++20 standard. So we implement it ourselves. 
+class SyncStream { public: - SyncOut(Context &ctx, std::ostream *out = &std::cout) : out(out) { - opt_demangle = ctx.arg.demangle; - } + SyncStream(std::ostream &out) : out(out) {} - ~SyncOut() { - if (out) { - std::scoped_lock lock(mu); - *out << ss.str() << "\n"; - } + ~SyncStream() { + emit(); } - template SyncOut &operator<<(T &&val) { - if (out) - ss << std::forward(val); + template SyncStream &operator<<(T &&val) { + ss << std::forward(val); return *this; } - static inline std::mutex mu; + void emit() { + if (emitted) + return; + + std::scoped_lock lock(mu); + out << ss.str() << '\n'; + emitted = true; + } private: - std::ostream *out; + std::ostream &out; std::stringstream ss; + bool emitted = false; + static inline std::mutex mu; }; template -static std::string add_color(Context &ctx, std::string msg) { - if (ctx.arg.color_diagnostics) - return "mold: \033[0;1;31m" + msg + ":\033[0m "; - return "mold: " + msg + ": "; -} +class Out { +public: + Out(Context &ctx) {} + + template Out &operator<<(T &&val) { + out << std::forward(val); + return *this; + } + +private: + SyncStream out{std::cout}; +}; + +static std::string_view fatal_mono = "mold: fatal: "; +static std::string_view fatal_color = "mold: \033[0;1;31mfatal:\033[0m "; +static std::string_view error_mono = "mold: error: "; +static std::string_view error_color = "mold: \033[0;1;31merror:\033[0m "; +static std::string_view warning_mono = "mold: warning: "; +static std::string_view warning_color = "mold: \033[0;1;35mwarning:\033[0m "; template class Fatal { public: - Fatal(Context &ctx) : out(ctx, &std::cerr) { - out << add_color(ctx, "fatal"); + Fatal(Context &ctx) { + out << (ctx.arg.color_diagnostics ? 
fatal_color : fatal_mono); } [[noreturn]] ~Fatal() { - out.~SyncOut(); + out.emit(); cleanup(); _exit(1); } - template Fatal &operator<<(T &&val) { + template Fatal &operator<<(T &&val) { out << std::forward(val); return *this; } private: - SyncOut out; + SyncStream out{std::cerr}; }; template class Error { public: - Error(Context &ctx) : out(ctx, &std::cerr) { + Error(Context &ctx) { if (ctx.arg.noinhibit_exec) { - out << add_color(ctx, "warning"); + out << (ctx.arg.color_diagnostics ? warning_color : warning_mono); } else { - out << add_color(ctx, "error"); + out << (ctx.arg.color_diagnostics ? error_color : error_mono); ctx.has_error = true; } } - template Error &operator<<(T &&val) { + template Error &operator<<(T &&val) { out << std::forward(val); return *this; } private: - SyncOut out; + SyncStream out{std::cerr}; }; template class Warn { public: - Warn(Context &ctx) - : out(ctx, ctx.arg.suppress_warnings ? nullptr : &std::cerr) { + Warn(Context &ctx) { + if (ctx.arg.suppress_warnings) + return; + + out.emplace(std::cerr); + if (ctx.arg.fatal_warnings) { - out << add_color(ctx, "error"); + *out << (ctx.arg.color_diagnostics ? error_color : error_mono); ctx.has_error = true; } else { - out << add_color(ctx, "warning"); + *out << (ctx.arg.color_diagnostics ? 
warning_color : warning_mono); } } - template Warn &operator<<(T &&val) { - out << std::forward(val); + template Warn &operator<<(T &&val) { + if (out) + *out << std::forward(val); return *this; } private: - SyncOut out; + std::optional out; }; // @@ -216,7 +216,7 @@ struct Atomic : std::atomic { using std::atomic::atomic; - Atomic(const Atomic &other) { store(other.load()); } + Atomic(const Atomic &other) : std::atomic(other.load()) {} Atomic &operator=(const Atomic &other) { store(other.load()); @@ -403,9 +403,9 @@ void update_maximum(std::atomic &atomic, u64 new_val, Compare cmp = {}) { std::memory_order_relaxed)); } -template -inline void append(std::vector &vec1, std::vector vec2) { - vec1.insert(vec1.end(), vec2.begin(), vec2.end()); +template +inline void append(std::vector &x, const auto &y) { + x.insert(x.end(), y.begin(), y.end()); } template @@ -421,13 +421,11 @@ inline std::vector flatten(std::vector> &vec) { return ret; } -template -inline void sort(T &vec) { +inline void sort(auto &vec) { std::stable_sort(vec.begin(), vec.end()); } -template -inline void sort(T &vec, U less) { +inline void sort(auto &vec, auto less) { std::stable_sort(vec.begin(), vec.end(), less); } @@ -443,10 +441,9 @@ inline i64 write_string(void *buf, std::string_view str) { } template -inline i64 write_vector(void *buf, const std::vector &vec) { - i64 sz = vec.size() * sizeof(T); - memcpy(buf, vec.data(), sz); - return sz; +inline void write_vector(void *buf, const std::vector &vec) { + if (!vec.empty()) + memcpy(buf, vec.data(), vec.size() * sizeof(T)); } inline void encode_uleb(std::vector &vec, u64 val) { @@ -512,9 +509,6 @@ inline u64 read_uleb(std::string_view str) { } inline i64 uleb_size(u64 val) { -#if __GNUC__ -#pragma GCC unroll 8 -#endif for (int i = 1; i < 9; i++) if (val < (1LL << (7 * i))) return i; @@ -529,21 +523,14 @@ inline void overwrite_uleb(u8 *loc, u64 val) { *loc = val & 0b0111'1111; } -template -std::string_view save_string(Context &ctx, const std::string 
&str) { - u8 *buf = new u8[str.size() + 1]; - memcpy(buf, str.data(), str.size()); - buf[str.size()] = '\0'; - ctx.string_pool.push_back(std::unique_ptr(buf)); - return {(char *)buf, str.size()}; -} - -inline bool remove_prefix(std::string_view &s, std::string_view prefix) { - if (s.starts_with(prefix)) { - s = s.substr(prefix.size()); - return true; - } - return false; +static inline void pause() { +#if defined(__x86_64__) + asm volatile("pause"); +#elif defined(__aarch64__) + asm volatile("yield"); +#elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__) + asm volatile("yield"); +#endif } // @@ -555,17 +542,26 @@ inline bool remove_prefix(std::string_view &s, std::string_view prefix) { // So you need to give a correct estimation of the final size before // using it. We use this hash map to uniquify pieces of data in // mergeable sections. +// +// We've implemented this ourselves because the performance of +// conrurent hash map is critical for our linker. template class ConcurrentMap { public: - ConcurrentMap() {} + ConcurrentMap() = default; ConcurrentMap(i64 nbuckets) { resize(nbuckets); } ~ConcurrentMap() { - free(entries); + if (entries) { +#ifdef _WIN32 + _aligned_free(entries); +#else + munmap(entries, sizeof(Entry) * nbuckets); +#endif + } } // In order to avoid unnecessary cache-line false sharing, we want @@ -573,40 +569,51 @@ class ConcurrentMap { // power-of-two address. struct alignas(32) Entry { Atomic key; - T value; u32 keylen; + T value; }; void resize(i64 nbuckets) { + assert(!entries); this->nbuckets = std::max(MIN_NBUCKETS, bit_ceil(nbuckets)); + i64 bufsize = sizeof(Entry) * this->nbuckets; - i64 sz = sizeof(Entry) * this->nbuckets; - free(entries); - -#if _WIN32 - // Even though std::aligned_alloc is defined in C++17, MSVC doesn't - // seem to provide that function. - entries = (Entry *)_aligned_malloc(sz, alignof(Entry)); + // Allocate a zero-initialized buffer. 
We use mmap() if available + // because it's faster than malloc() and memset(). +#ifdef _WIN32 + entries = (Entry *)_aligned_malloc(bufsize, alignof(Entry)); + memset((void *)entries, 0, bufsize); #else - entries = (Entry *)std::aligned_alloc(alignof(Entry), sz); + entries = (Entry *)mmap(nullptr, bufsize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); #endif - - memset(entries, 0, sz); } - std::pair insert(std::string_view key, u64 hash, const T &val) { + std::pair insert(std::string_view key, u32 hash, const T &val) { assert(has_single_bit(nbuckets)); - i64 idx = hash & (nbuckets - 1); - i64 retry = 0; + i64 begin = hash & (nbuckets - 1); + u64 mask = nbuckets / NUM_SHARDS - 1; - while (retry < MAX_RETRY) { + for (i64 i = 0; i < MAX_RETRY; i++) { + i64 idx = (begin & ~mask) | ((begin + i) & mask); Entry &ent = entries[idx]; + + // It seems avoiding compare-and-swap is faster overall at least + // on my Zen4 machine, so do it. + if (const char *ptr = ent.key.load(std::memory_order_acquire); + ptr != nullptr && ptr != (char *)-1) { + if (key == std::string_view(ptr, ent.keylen)) + return {&ent.value, false}; + continue; + } + + // Otherwise, use CAS to atomically claim the ownership of the slot. const char *ptr = nullptr; - bool claimed = ent.key.compare_exchange_weak(ptr, (char *)-1, - std::memory_order_acquire); + bool claimed = ent.key.compare_exchange_strong(ptr, (char *)-1, + std::memory_order_acquire); - // If we successfully claimed the ownership of an unused slot, + // If we successfully claimed the ownership of the slot, // copy values to it. if (claimed) { new (&ent.value) T(val); @@ -615,10 +622,6 @@ class ConcurrentMap { return {&ent.value, true}; } - // Loop on a spurious failure. - if (ptr == nullptr) - continue; - // If someone is copying values to the slot, do busy wait. while (ptr == (char *)-1) { pause(); @@ -629,11 +632,6 @@ class ConcurrentMap { // looking for. 
if (key == std::string_view(ptr, ent.keylen)) return {&ent.value, false}; - - // Otherwise, move on to the next slot. - u64 mask = nbuckets / NUM_SHARDS - 1; - idx = (idx & ~mask) | ((idx + 1) & mask); - retry++; } assert(false && "ConcurrentMap is full"); @@ -701,79 +699,13 @@ class ConcurrentMap { Entry *entries = nullptr; i64 nbuckets = 0; - -private: - static void pause() { -#if defined(__x86_64__) - asm volatile("pause"); -#elif defined(__aarch64__) - asm volatile("yield"); -#endif - } }; // -// output-file.h +// random.cc // -template -class OutputFile { -public: - static std::unique_ptr> - open(Context &ctx, std::string path, i64 filesize, i64 perm); - - virtual void close(Context &ctx) = 0; - virtual ~OutputFile() = default; - - u8 *buf = nullptr; - std::vector buf2; - std::string path; - i64 fd = -1; - i64 filesize = 0; - bool is_mmapped = false; - bool is_unmapped = false; - -protected: - OutputFile(std::string path, i64 filesize, bool is_mmapped) - : path(path), filesize(filesize), is_mmapped(is_mmapped) {} -}; - -template -class MallocOutputFile : public OutputFile { -public: - MallocOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm) - : OutputFile(path, filesize, false), perm(perm) { - this->buf = (u8 *)malloc(filesize); - if (!this->buf) - Fatal(ctx) << "malloc failed"; - } - - ~MallocOutputFile() { - free(this->buf); - } - - void close(Context &ctx) override { - Timer t(ctx, "close_file"); - FILE *fp; - - if (this->path == "-") { - fp = stdout; - } else { - i64 fd = ::open(this->path.c_str(), O_RDWR | O_CREAT, perm); - if (fd == -1) - Fatal(ctx) << "cannot open " << this->path << ": " << errno_string(); - fp = fdopen(fd, "w"); - } - - fwrite(this->buf, this->filesize, 1, fp); - if (!this->buf2.empty()) - fwrite(this->buf2.data(), this->buf2.size(), 1, fp); - fclose(fp); - } - -private: - i64 perm; -}; +void get_random_bytes(u8 *buf, i64 size); // // hyperloglog.cc @@ -781,9 +713,7 @@ class MallocOutputFile : public OutputFile { class 
HyperLogLog { public: - HyperLogLog() : buckets(NBUCKETS) {} - - void insert(u32 hash) { + void insert(u64 hash) { update_maximum(buckets[hash & (NBUCKETS - 1)], std::countl_zero(hash) + 1); } @@ -798,7 +728,7 @@ class HyperLogLog { static constexpr i64 NBUCKETS = 2048; static constexpr double ALPHA = 0.79402; - std::vector buckets; + Atomic buckets[NBUCKETS]; }; // @@ -832,13 +762,13 @@ class Glob { class MultiGlob { public: - bool add(std::string_view pat, u32 val); + bool add(std::string_view pat, i64 val); bool empty() const { return strings.empty(); } - std::optional find(std::string_view str); + std::optional find(std::string_view str); private: struct TrieNode { - u32 value = -1; + i64 value = -1; TrieNode *suffix_link = nullptr; std::unique_ptr children[256]; }; @@ -846,26 +776,21 @@ class MultiGlob { void compile(); void fix_suffix_links(TrieNode &node); void fix_values(); + i64 find_aho_corasick(std::string_view str); std::vector strings; std::unique_ptr root; - std::vector> globs; + std::vector> globs; std::once_flag once; bool is_compiled = false; + bool prefix_match = false; }; -// -// uuid.cc -// - -std::array get_uuid_v4(); - // // filepath.cc // -template -std::filesystem::path filepath(const T &path) { +std::filesystem::path filepath(const auto &path) { return {path, std::filesystem::path::format::generic_format}; } @@ -877,8 +802,22 @@ std::filesystem::path to_abs_path(std::filesystem::path path); // demangle.cc // -std::string_view demangle(std::string_view name); -std::optional cpp_demangle(std::string_view name); +std::optional demangle_cpp(std::string_view name); +std::optional demangle_rust(std::string_view name); + +// +// jbos.cc +// + +void acquire_global_lock(); +void release_global_lock(); + +// +// crc32.cc +// + +u32 compute_crc32(u32 crc, u8 *buf, i64 len); +std::vector crc32_solve(u32 current, u32 desired); // // compress.cc @@ -928,8 +867,6 @@ class TarWriter { void append(std::string path, std::string_view data); private: - static 
constexpr i64 BLOCK_SIZE = 512; - TarWriter(FILE *out, std::string basedir) : out(out), basedir(basedir) {} FILE *out = nullptr; @@ -942,16 +879,24 @@ class TarWriter { // MappedFile represents an mmap'ed input file. // mold uses mmap-IO only. -template class MappedFile { public: - static MappedFile *open(Context &ctx, std::string path); - static MappedFile *must_open(Context &ctx, std::string path); - ~MappedFile() { unmap(); } void unmap(); - - MappedFile *slice(Context &ctx, std::string name, u64 start, u64 size); + void close_fd(); + void reopen_fd(const std::string &path); + + template + MappedFile *slice(Context &ctx, std::string name, u64 start, u64 size) { + MappedFile *mf = new MappedFile; + mf->name = name; + mf->data = data + start; + mf->size = size; + mf->parent = this; + + ctx.mf_pool.push_back(std::unique_ptr(mf)); + return mf; + } std::string_view get_contents() { return std::string_view((char *)data, size); @@ -984,98 +929,40 @@ class MappedFile { bool given_fullpath = true; MappedFile *parent = nullptr; MappedFile *thin_parent = nullptr; - int fd = -1; -#ifdef _WIN32 - HANDLE file_handle = INVALID_HANDLE_VALUE; -#endif -}; -template -MappedFile *MappedFile::open(Context &ctx, std::string path) { - if (path.starts_with('/') && !ctx.arg.chroot.empty()) - path = ctx.arg.chroot + "/" + path_clean(path); + // For --dependency-file + bool is_dependency = true; - i64 fd; #ifdef _WIN32 - fd = ::_open(path.c_str(), O_RDONLY); + HANDLE fd = INVALID_HANDLE_VALUE; #else - fd = ::open(path.c_str(), O_RDONLY); + int fd = -1; #endif +}; - if (fd == -1) { - if (errno != ENOENT) - Fatal(ctx) << "opening " << path << " failed: " << errno_string(); - return nullptr; - } - - struct stat st; - if (fstat(fd, &st) == -1) - Fatal(ctx) << path << ": fstat failed: " << errno_string(); - - MappedFile *mf = new MappedFile; - ctx.mf_pool.push_back(std::unique_ptr(mf)); +MappedFile *open_file_impl(const std::string &path, std::string &error); - mf->name = path; - mf->size = 
st.st_size; +template +MappedFile *open_file(Context &ctx, std::string path) { + if (path.starts_with('/') && !ctx.arg.chroot.empty()) + path = ctx.arg.chroot + "/" + path_clean(path); - if (st.st_size > 0) { -#ifdef _WIN32 - HANDLE handle = CreateFileMapping((HANDLE)_get_osfhandle(fd), - nullptr, PAGE_READWRITE, 0, - st.st_size, nullptr); - if (!handle) - Fatal(ctx) << path << ": CreateFileMapping failed: " << GetLastError(); - mf->file_handle = handle; - mf->data = (u8 *)MapViewOfFile(handle, FILE_MAP_ALL_ACCESS, 0, 0, st.st_size); - if (!mf->data) - Fatal(ctx) << path << ": MapViewOfFile failed: " << GetLastError(); -#else - mf->data = (u8 *)mmap(nullptr, st.st_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE, fd, 0); - if (mf->data == MAP_FAILED) - Fatal(ctx) << path << ": mmap failed: " << errno_string(); -#endif - } + std::string error; + MappedFile *mf = open_file_impl(path, error); + if (!error.empty()) + Fatal(ctx) << error; - close(fd); + if (mf) + ctx.mf_pool.push_back(std::unique_ptr(mf)); return mf; } template -MappedFile * -MappedFile::must_open(Context &ctx, std::string path) { - if (MappedFile *mf = MappedFile::open(ctx, path)) - return mf; - Fatal(ctx) << "cannot open " << path << ": " << errno_string(); -} - -template -MappedFile * -MappedFile::slice(Context &ctx, std::string name, u64 start, u64 size) { - MappedFile *mf = new MappedFile; - mf->name = name; - mf->data = data + start; - mf->size = size; - mf->parent = this; - - ctx.mf_pool.push_back(std::unique_ptr(mf)); +MappedFile *must_open_file(Context &ctx, std::string path) { + MappedFile *mf = open_file(ctx, path); + if (!mf) + Fatal(ctx) << "cannot open " << path << ": " << errno_string(); return mf; } -template -void MappedFile::unmap() { - if (size == 0 || parent || !data) - return; - -#ifdef _WIN32 - UnmapViewOfFile(data); - if (file_handle != INVALID_HANDLE_VALUE) - CloseHandle(file_handle); -#else - munmap(data, size); -#endif - - data = nullptr; -} - } // namespace mold diff --git 
a/common/compress.cc b/lib/compress.cc similarity index 96% rename from common/compress.cc rename to lib/compress.cc index 59bdc25a..23764c31 100644 --- a/common/compress.cc +++ b/lib/compress.cc @@ -132,7 +132,7 @@ void ZlibCompressor::write_to(u8 *buf) { offsets[i] = offsets[i - 1] + shards[i - 1].size(); tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) { - memcpy(&buf[offsets[i]], shards[i].data(), shards[i].size()); + memcpy(buf + offsets[i], shards[i].data(), shards[i].size()); }); // Write a trailer @@ -146,10 +146,10 @@ void ZlibCompressor::write_to(u8 *buf) { static std::vector zstd_compress(std::string_view input) { std::vector buf(ZSTD_COMPRESSBOUND(input.size())); - constexpr int level = 3; // compression level; must be between 1 to 22 + constexpr int LEVEL = 3; // compression level; must be between 1 to 22 size_t sz = ZSTD_compress(buf.data(), buf.size(), input.data(), input.size(), - level); + LEVEL); assert(!ZSTD_isError(sz)); buf.resize(sz); buf.shrink_to_fit(); @@ -178,7 +178,7 @@ void ZstdCompressor::write_to(u8 *buf) { offsets[i] = offsets[i - 1] + shards[i - 1].size(); tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) { - memcpy(&buf[offsets[i]], shards[i].data(), shards[i].size()); + memcpy(buf + offsets[i], shards[i].data(), shards[i].size()); }); } diff --git a/common/config.h.in b/lib/config.h.in similarity index 75% rename from common/config.h.in rename to lib/config.h.in index 6cbe35e0..be3eec8d 100644 --- a/common/config.h.in +++ b/lib/config.h.in @@ -1,3 +1,4 @@ #define MOLD_VERSION "@mold_VERSION@" #define MOLD_LIBDIR "@CMAKE_INSTALL_FULL_LIBDIR@" -#cmakedefine01 MOLD_IS_SOLD + +#cmakedefine HAVE_MADVISE 1 diff --git a/lib/crc32.cc b/lib/crc32.cc new file mode 100644 index 00000000..d3f71783 --- /dev/null +++ b/lib/crc32.cc @@ -0,0 +1,60 @@ +#include "common.h" + +#include +#include + +namespace mold { + +// This function "forges" a CRC. 
That is, given the current and a desired +// CRC32 value, crc32_solve() returns a binary blob to add to the end of +// the original data to yield the desired CRC. Trailing garbage is ignored +// by many bianry file formats, so you can create a file with a desired +// CRC using crc32_solve(). We need it for --separate-debug-file. +std::vector crc32_solve(u32 current, u32 desired) { + constexpr u32 poly = 0xedb88320; + u32 x = ~desired; + + // Each iteration computes x = (x * x^-1) mod poly. + for (i64 i = 0; i < 32; i++) { + x = std::rotl(x, 1); + x ^= (x & 1) * (poly << 1); + } + + x ^= ~current; + + std::vector out(4); + out[0] = x; + out[1] = x >> 8; + out[2] = x >> 16; + out[3] = x >> 24; + return out; +} + +// Compute a CRC for given data in parallel +u32 compute_crc32(u32 crc, u8 *buf, i64 len) { + struct Shard { + u8 *buf; + i64 len; + u32 crc; + }; + + constexpr i64 shard_size = 1024 * 1024; // 1 MiB + std::vector shards; + + while (len > 0) { + i64 sz = std::min(len, shard_size); + shards.push_back({buf, sz, 0}); + buf += sz; + len -= sz; + } + + tbb::parallel_for_each(shards.begin(), shards.end(), [](Shard &shard) { + shard.crc = crc32(0, shard.buf, shard.len); + }); + + for (Shard &shard : shards) + crc = crc32_combine(crc, shard.crc, shard.len); + return crc; +} + +} // namespace mold diff --git a/common/demangle.cc b/lib/demangle.cc similarity index 54% rename from common/demangle.cc rename to lib/demangle.cc index 4bdea119..bb2a3a39 100644 --- a/common/demangle.cc +++ b/lib/demangle.cc @@ -10,25 +10,7 @@ namespace mold { -std::string_view demangle(std::string_view name) { - static thread_local char *p; - if (p) - free(p); - - // Try to demangle as a Rust symbol. Since legacy-style Rust symbols - // are also valid as a C++ mangled name, we need to call this before - // cpp_demangle. - p = rust_demangle(std::string(name).c_str(), 0); - if (p) - return p; - - // Try to demangle as a C++ symbol. 
- if (std::optional s = cpp_demangle(name)) - return *s; - return name; -} - -std::optional cpp_demangle(std::string_view name) { +std::optional demangle_cpp(std::string_view name) { static thread_local char *buf; static thread_local size_t buflen; @@ -48,4 +30,13 @@ std::optional cpp_demangle(std::string_view name) { return {}; } +std::optional demangle_rust(std::string_view name) { + static thread_local char *buf; + free(buf); + buf = rust_demangle(std::string(name).c_str(), 0); + if (buf) + return buf; + return {}; +} + } // namespace mold diff --git a/common/filepath.cc b/lib/filepath.cc similarity index 51% rename from common/filepath.cc rename to lib/filepath.cc index 670188e2..8376bdf6 100644 --- a/common/filepath.cc +++ b/lib/filepath.cc @@ -3,6 +3,14 @@ #include #include +#ifdef __APPLE__ +# include +#endif + +#ifdef __FreeBSD__ +# include +#endif + namespace mold { std::string get_realpath(std::string_view path) { @@ -26,4 +34,30 @@ std::filesystem::path to_abs_path(std::filesystem::path path) { return (std::filesystem::current_path() / path).lexically_normal(); } +// Returns the path of the mold executable itself +std::string get_self_path() { +#if __APPLE__ || _WIN32 + fprintf(stderr, "mold: get_self_path is not supported"); + exit(1); +#elif __FreeBSD__ + // /proc may not be mounted on FreeBSD. The proper way to get the + // current executable's path is to use sysctl(2). 
+ int mib[4]; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PATHNAME; + mib[3] = -1; + + size_t size; + sysctl(mib, 4, NULL, &size, NULL, 0); + + std::string path; + path.resize(size); + sysctl(mib, 4, path.data(), &size, NULL, 0); + return path; +#else + return std::filesystem::read_symlink("/proc/self/exe").string(); +#endif +} + } // namespace mold diff --git a/test/gentoo-test.sh b/lib/gentoo-test.sh similarity index 86% rename from test/gentoo-test.sh rename to lib/gentoo-test.sh index 3fac9f17..dbdae006 100755 --- a/test/gentoo-test.sh +++ b/lib/gentoo-test.sh @@ -26,12 +26,13 @@ if ! docker image ls mold-gentoo | grep -q mold-gentoo; then cat <> /etc/portage/make.conf && \ +RUN echo 'USE="X ssl elogind -systemd corefonts truetype jpeg jpeg2k tiff zstd static-libs binary -perl"' >> /etc/portage/make.conf && \ echo 'ACCEPT_KEYWORDS="~amd64"' >> /etc/portage/make.conf && \ echo 'ACCEPT_LICENSE="* -@EULA"' >> /etc/portage/make.conf && \ echo 'FEATURES="\${FEATURE} noclean nostrip ccache -ipc-sandbox -network-sandbox -pid-sandbox -sandbox"' >> /etc/portage/make.conf && \ - echo 'CCACHE_DIR="/ccache"' >> /etc/portage/make.conf -RUN emerge gdb lld clang vim emacs strace ccache xeyes dev-util/cmake dev-vcs/git && rm -rf /var/tmp/portage + echo 'CCACHE_DIR="/ccache"' >> /etc/portage/make.conf && \ + emerge gdb lld clang vim emacs strace ccache xeyes dev-build/cmake dev-vcs/git && \ + rm -rf /var/tmp/portage EOF set +e fi @@ -48,7 +49,7 @@ cmd1='(cd /usr/bin; ln -sf /mold/mold $(realpath ld))' cmd2="MAKEOPTS=-'j$(nproc) --load-average=100' emerge --onlydeps $package" cmd3="MAKEOPTS='-j$(nproc) --load-average=100' FEATURES=test emerge $package" filename=`echo "$package" | sed 's!/!_!g'` -docker="docker run --rm --cap-add=SYS_PTRACE -v `pwd`:/mold -v /var/cache/ccache-gentoo:/ccache mold-gentoo timeout -v -k 15s 1h" +docker="docker run --rm --cap-add=SYS_PTRACE -v `pwd`:/mold -v /var/cache/ccache-gentoo:/ccache mold-gentoo timeout -v -k 15s 3h" 
dir=gentoo/$git_hash mkdir -p "$dir"/success "$dir"/failure diff --git a/common/glob.cc b/lib/glob.cc similarity index 92% rename from common/glob.cc rename to lib/glob.cc index 62eff444..dfe90b50 100644 --- a/common/glob.cc +++ b/lib/glob.cc @@ -79,6 +79,14 @@ std::optional Glob::compile(std::string_view pat) { case '*': vec.push_back({STAR}); break; + case '\\': + if (pat.empty()) + return {}; + if (vec.empty() || vec.back().kind != STRING) + vec.push_back({STRING}); + vec.back().str += pat[0]; + pat = pat.substr(1); + break; default: if (vec.empty() || vec.back().kind != STRING) vec.push_back({STRING}); @@ -101,7 +109,7 @@ bool Glob::do_match(std::string_view str, std::span elements) { switch (e.kind) { case STRING: - if (str.empty() || !str.starts_with(e.str)) + if (!str.starts_with(e.str)) return false; str = str.substr(e.str.size()); break; @@ -116,13 +124,12 @@ bool Glob::do_match(std::string_view str, std::span elements) { for (;;) { size_t pos = str.find(elements[0].str); if (pos == str.npos) - break; + return false; if (do_match(str.substr(pos + elements[0].str.size()), elements.subspan(1))) return true; str = str.substr(pos + 1); } - return false; } // Other cases are handled here. diff --git a/common/hyperloglog.cc b/lib/hyperloglog.cc similarity index 100% rename from common/hyperloglog.cc rename to lib/hyperloglog.cc diff --git a/lib/integers.h b/lib/integers.h new file mode 100644 index 00000000..11582f70 --- /dev/null +++ b/lib/integers.h @@ -0,0 +1,144 @@ +// This file defines integral types for file input/output. We need to use +// these types instead of the plain integers (such as uint32_t or int32_t) +// when reading from/writing to an mmap'ed file area for the following +// reasons: +// +// 1. mold is always a cross linker and should not depend on what host it +// is running on. For example, users should be able to run mold on a +// big-endian SPARC machine to create a little-endian RV64 binary. +// +// 2. 
Even though data members in all ELF data strucutres are naturally +// aligned, they are not guaranteed to be aligned on memory because of +// archive files. Archive files (.a files) align each member only to a +// 2 byte boundary, so anything larger than 2 bytes may be misaligned +// in an mmap'ed memory. Misaligned access is an undefined behavior in +// C/C++, so we shouldn't cast an arbitrary pointer to a uint32_t, for +// example, to read a 32 bit value. +// +// The data types defined in this file don't depend on host byte order and +// don't do unaligned access. + +#pragma once + +#include +#include +#include + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define __LITTLE_ENDIAN__ 1 +# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define __BIG_ENDIAN__ 1 +# else +# error "unknown host byte order" +# endif +#endif + +namespace mold { + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t i8; +typedef int16_t i16; +typedef int32_t i32; +typedef int64_t i64; + +template +class Integer { +public: + constexpr Integer() = default; + + constexpr Integer(T x) requires (endian == std::endian::little && size == 2) + : buf{(u8)x, (u8)(x >> 8)} {} + + constexpr Integer(T x) requires (endian == std::endian::little && size == 3) + : buf{(u8)x, (u8)(x >> 8), (u8)(x >> 16)} {} + + constexpr Integer(T x) requires (endian == std::endian::little && size == 4) + : buf{(u8)x, (u8)(x >> 8), (u8)(x >> 16), (u8)(x >> 24)} {} + + constexpr Integer(T x) requires (endian == std::endian::little && size == 8) + : buf{(u8)x, (u8)(x >> 8), (u8)(x >> 16), (u8)(x >> 24), + (u8)(x >> 32), (u8)(x >> 40), (u8)(x >> 48), (u8)(x >> 56)} {} + + constexpr Integer(T x) requires (endian == std::endian::big && size == 2) + : buf{(u8)(x >> 8), (u8)x} {} + + constexpr Integer(T x) requires (endian == std::endian::big && size == 3) + : buf{(u8)(x >> 16), (u8)(x >> 8), (u8)x} {} + + 
constexpr Integer(T x) requires (endian == std::endian::big && size == 4) + : buf{(u8)(x >> 24), (u8)(x >> 16), (u8)(x >> 8), (u8)x} {} + + constexpr Integer(T x) requires (endian == std::endian::big && size == 8) + : buf{(u8)(x >> 56), (u8)(x >> 48), (u8)(x >> 40), (u8)(x >> 32), + (u8)(x >> 24), (u8)(x >> 16), (u8)(x >> 8), (u8)x} {} + + Integer &operator=(T x) { + new (this) Integer(x); + return *this; + } + + operator T() const { + if constexpr (endian == std::endian::little) { + if constexpr (size == 2) + return buf[1] << 8 | buf[0]; + else if constexpr (size == 3) + return buf[2] << 16 | buf[1] << 8 | buf[0]; + else if constexpr (size == 4) + return buf[3] << 24 | buf[2] << 16 | buf[1] << 8 | buf[0]; + else + return (u64)buf[7] << 56 | (u64)buf[6] << 48 | + (u64)buf[5] << 40 | (u64)buf[4] << 32 | + (u64)buf[3] << 24 | (u64)buf[2] << 16 | + (u64)buf[1] << 8 | (u64)buf[0]; + } else { + if constexpr (size == 2) + return buf[0] << 8 | buf[1]; + else if constexpr (size == 3) + return buf[0] << 16 | buf[1] << 8 | buf[2]; + else if constexpr (size == 4) + return buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3]; + else + return (u64)buf[0] << 56 | (u64)buf[1] << 48 | + (u64)buf[2] << 40 | (u64)buf[3] << 32 | + (u64)buf[4] << 24 | (u64)buf[5] << 16 | + (u64)buf[6] << 8 | (u64)buf[7]; + } + } + + Integer &operator++() { return *this = *this + 1; } + Integer operator++(int) { return ++*this - 1; } + Integer &operator--() { return *this = *this - 1; } + Integer operator--(int) { return --*this + 1; } + Integer &operator+=(T x) { return *this = *this + x; } + Integer &operator-=(T x) { return *this = *this - x; } + Integer &operator&=(T x) { return *this = *this & x; } + Integer &operator|=(T x) { return *this = *this | x; } + +private: + u8 buf[size]; +};; + +using il16 = Integer; +using il32 = Integer; +using il64 = Integer; + +using ul16 = Integer; +using ul24 = Integer; +using ul32 = Integer; +using ul64 = Integer; + +using ib16 = Integer; +using ib32 = Integer; 
+using ib64 = Integer; + +using ub16 = Integer; +using ub24 = Integer; +using ub32 = Integer; +using ub64 = Integer; + +} // namespace mold diff --git a/elf/jobs.cc b/lib/jobs-unix.cc similarity index 62% rename from elf/jobs.cc rename to lib/jobs-unix.cc index 12b4a6a9..9912ab52 100644 --- a/elf/jobs.cc +++ b/lib/jobs-unix.cc @@ -9,21 +9,20 @@ // mold processes to just 1 for each user. It is intended to be used as // `MOLD_JOBS=1 ninja` or `MOLD_JOBS=1 make -j$(nproc)`. -#include "mold.h" +#include "common.h" -#ifndef _WIN32 -# include -# include -# include -# include -# include -#endif +#include +#include +#include +#include +#include +#include -namespace mold::elf { +namespace mold { -template -void acquire_global_lock(Context &ctx) { -#ifndef _WIN32 +static int lock_fd = -1; + +void acquire_global_lock() { char *jobs = getenv("MOLD_JOBS"); if (!jobs || jobs != "1"s) return; @@ -40,22 +39,12 @@ void acquire_global_lock(Context &ctx) { if (lockf(fd, F_LOCK, 0) == -1) return; - - ctx.global_lock_fd = fd; -#endif + lock_fd = fd; } -template -void release_global_lock(Context &ctx) { -#ifndef _WIN32 - if (ctx.global_lock_fd) - close(*ctx.global_lock_fd); -#endif +void release_global_lock() { + if (lock_fd != -1) + close(lock_fd); } -using E = MOLD_TARGET; - -template void acquire_global_lock(Context &); -template void release_global_lock(Context &); - -} // namespace mold::elf +} // namespace mold diff --git a/lib/jobs-win32.cc b/lib/jobs-win32.cc new file mode 100644 index 00000000..8a7c1942 --- /dev/null +++ b/lib/jobs-win32.cc @@ -0,0 +1,6 @@ +namespace mold { + +void acquire_global_lock() {} +void release_global_lock() {} + +} // namespace mold diff --git a/lib/malloc.cc b/lib/malloc.cc new file mode 100644 index 00000000..fc2f0afc --- /dev/null +++ b/lib/malloc.cc @@ -0,0 +1,5 @@ +#include "config.h" + +#ifdef USE_SYSTEM_MIMALLOC +# include +#endif diff --git a/lib/mapped-file-unix.cc b/lib/mapped-file-unix.cc new file mode 100644 index 00000000..918e6c5d --- 
/dev/null +++ b/lib/mapped-file-unix.cc @@ -0,0 +1,51 @@ +#include "common.h" + +namespace mold { + +MappedFile *open_file_impl(const std::string &path, std::string &error) { + i64 fd = ::open(path.c_str(), O_RDONLY); + if (fd == -1) { + if (errno != ENOENT) + error = "opening " + path + " failed: " + errno_string(); + return nullptr; + } + + struct stat st; + if (fstat(fd, &st) == -1) + error = path + ": fstat failed: " + errno_string(); + + MappedFile *mf = new MappedFile; + mf->name = path; + mf->size = st.st_size; + + if (st.st_size > 0) { + mf->data = (u8 *)mmap(nullptr, st.st_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (mf->data == MAP_FAILED) + error = path + ": mmap failed: " + errno_string(); + } + + close(fd); + return mf; +} + +void MappedFile::unmap() { + if (size == 0 || parent || !data) + return; + munmap(data, size); + data = nullptr; +} + +void MappedFile::close_fd() { + if (fd == -1) + return; + close(fd); + fd = -1; +} + +void MappedFile::reopen_fd(const std::string &path) { + if (fd == -1) + fd = open(path.c_str(), O_RDONLY); +} + +} // namespace mold diff --git a/lib/mapped-file-win32.cc b/lib/mapped-file-win32.cc new file mode 100644 index 00000000..93c3ef4b --- /dev/null +++ b/lib/mapped-file-win32.cc @@ -0,0 +1,81 @@ +#include "common.h" + +namespace mold { + +MappedFile *open_file_impl(const std::string &path, std::string &error) { + HANDLE fd = CreateFileA(path.c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + + if (fd == INVALID_HANDLE_VALUE) { + auto err = GetLastError(); + if (err != ERROR_FILE_NOT_FOUND) + error = "opening " + path + " failed: " + errno_string(); + return nullptr; + } + + if (GetFileType(fd) != FILE_TYPE_DISK) { + CloseHandle(fd); + return nullptr; + } + + DWORD size_hi; + DWORD size_lo = GetFileSize(fd, &size_hi); + + if (size_lo == INVALID_FILE_SIZE) { + error = path + ": GetFileSize failed: " + errno_string(); + 
return nullptr; + } + + u64 size = ((u64)size_hi << 32) + size_lo; + + MappedFile *mf = new MappedFile; + mf->name = path; + mf->size = size; + mf->fd = fd; + + if (size > 0) { + HANDLE h = CreateFileMapping(fd, nullptr, PAGE_READONLY, 0, size, nullptr); + if (!h) { + error = path + ": CreateFileMapping failed: " + errno_string(); + return nullptr; + } + + mf->data = (u8 *)MapViewOfFile(h, FILE_MAP_COPY, 0, 0, size); + CloseHandle(h); + + if (!mf->data) { + error = path + ": MapViewOfFile failed: " + errno_string(); + return nullptr; + } + } + + return mf; +} + +void MappedFile::unmap() { + if (size == 0 || parent || !data) + return; + + UnmapViewOfFile(data); + if (fd != INVALID_HANDLE_VALUE) + CloseHandle(fd); + + data = nullptr; +} + +void MappedFile::close_fd() { + if (fd == INVALID_HANDLE_VALUE) + return; + CloseHandle(fd); + fd = INVALID_HANDLE_VALUE; +} + +void MappedFile::reopen_fd(const std::string &path) { + if (fd == INVALID_HANDLE_VALUE) + fd = CreateFileA(path.c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); +} + +} // namespace mold diff --git a/common/multi-glob.cc b/lib/multi-glob.cc similarity index 72% rename from common/multi-glob.cc rename to lib/multi-glob.cc index 07dad9bb..e7688005 100644 --- a/common/multi-glob.cc +++ b/lib/multi-glob.cc @@ -24,44 +24,54 @@ namespace mold { -std::optional MultiGlob::find(std::string_view str) { +std::optional MultiGlob::find(std::string_view str) { std::call_once(once, [&] { compile(); }); - u32 val = UINT32_MAX; + i64 val = -1; - if (root) { - // Match against simple glob patterns - TrieNode *node = root.get(); - - auto walk = [&](u8 c) { - for (;;) { - if (node->children[c]) { - node = node->children[c].get(); - val = std::min(val, node->value); - return; - } - - if (!node->suffix_link) - return; - node = node->suffix_link; - } - }; - - walk('\0'); - for (u8 c : str) - walk(c); - walk('\0'); - } + // Match against 
simple glob patterns + if (root) + val = find_aho_corasick(str); // Match against complex glob patterns - for (std::pair &glob : globs) + for (std::pair &glob : globs) if (glob.first.match(str)) - val = std::min(val, glob.second); + val = std::max(val, glob.second); - if (val == UINT32_MAX) + if (val == -1) return {}; return val; } +i64 MultiGlob::find_aho_corasick(std::string_view str) { + TrieNode *node = root.get(); + i64 val = -1; + + auto walk = [&](u8 c) { + for (;;) { + if (node->children[c]) { + node = node->children[c].get(); + val = std::max(val, node->value); + return; + } + + if (!node->suffix_link) + return; + node = node->suffix_link; + } + }; + + walk('\0'); + + for (u8 c : str) { + if (prefix_match && node == root.get()) + return val; + walk(c); + } + + walk('\0'); + return val; +} + static bool is_simple_pattern(std::string_view pat) { static std::regex re(R"(\*?[^*[?]+\*?)", std::regex_constants::optimize); return std::regex_match(pat.begin(), pat.end(), re); @@ -82,7 +92,7 @@ static std::string handle_stars(std::string_view pat) { return "\0"s + str + "\0"s; } -bool MultiGlob::add(std::string_view pat, u32 val) { +bool MultiGlob::add(std::string_view pat, i64 val) { assert(!is_compiled); assert(!pat.empty()); @@ -108,7 +118,7 @@ bool MultiGlob::add(std::string_view pat, u32 val) { node = node->children[c].get(); } - node->value = std::min(node->value, val); + node->value = std::max(node->value, val); return true; } @@ -117,6 +127,16 @@ void MultiGlob::compile() { if (root) { fix_suffix_links(*root); fix_values(); + + // If no pattern starts with '*', set prefix_match to true. + // We'll use this flag for optimization. 
+ prefix_match = true; + for (i64 i = 1; i < 256; i++) { + if (root->children[i]) { + prefix_match = false; + break; + } + } } } @@ -157,7 +177,7 @@ void MultiGlob::fix_values() { for (std::unique_ptr &child : node->children) { if (!child) continue; - child->value = std::min(child->value, child->suffix_link->value); + child->value = std::max(child->value, child->suffix_link->value); queue.push(child.get()); } } while (!queue.empty()); diff --git a/common/perf.cc b/lib/perf.cc similarity index 92% rename from common/perf.cc rename to lib/perf.cc index 5d5eb263..dfc4c73c 100644 --- a/common/perf.cc +++ b/lib/perf.cc @@ -26,19 +26,13 @@ void Counter::print() { } static i64 now_nsec() { -#ifdef _WIN32 return (i64)std::chrono::steady_clock::now().time_since_epoch().count(); -#else - struct timespec t; - clock_gettime(CLOCK_MONOTONIC, &t); - return (i64)t.tv_sec * 1'000'000'000 + t.tv_nsec; -#endif } static std::pair get_usage() { #ifdef _WIN32 auto to_nsec = [](FILETIME t) -> i64 { - return ((u64)t.dwHighDateTime << 32 + (u64)t.dwLowDateTime) * 100; + return (((u64)t.dwHighDateTime << 32) + (u64)t.dwLowDateTime) * 100; }; FILETIME creation, exit, kernel, user; diff --git a/lib/random.cc b/lib/random.cc new file mode 100644 index 00000000..6eca7727 --- /dev/null +++ b/lib/random.cc @@ -0,0 +1,20 @@ +#include "common.h" + +#include + +namespace mold { + +void get_random_bytes(u8 *buf, i64 size) { + std::random_device rand; + i64 i = 0; + + for (; i < size - 4; i += 4) { + u32 val = rand(); + memcpy(buf + i, &val, 4); + } + + u32 val = rand(); + memcpy(buf + i, &val, size - i); +} + +} // namespace mold diff --git a/lib/signal-unix.cc b/lib/signal-unix.cc new file mode 100644 index 00000000..b8ce82dd --- /dev/null +++ b/lib/signal-unix.cc @@ -0,0 +1,88 @@ +#include "common.h" + +#include +#include + +#ifdef __FreeBSD__ +# include +# include +#endif + +namespace mold { + +std::string errno_string() { + // strerror is not thread-safe, so guard it with a lock. 
+ static std::mutex mu; + std::scoped_lock lock(mu); + return strerror(errno); +} + +void cleanup() { + if (output_tmpfile) + unlink(output_tmpfile); +} + +// mold mmap's an output file, and the mmap succeeds even if there's +// no enough space left on the filesystem. The actual disk blocks are +// not allocated on the mmap call but when the program writes to it +// for the first time. +// +// If a disk becomes full as a result of a write to an mmap'ed memory +// region, the failure of the write is reported as a SIGBUS or structured +// exeption with code EXCEPTION_IN_PAGE_ERROR on Windows. This +// signal handler catches that signal and prints out a user-friendly +// error message. Without this, it is very hard to realize that the +// disk might be full. +static std::string sigabrt_msg; + +static void sighandler(int signo, siginfo_t *info, void *ucontext) { + static std::mutex mu; + std::scoped_lock lock{mu}; + + // Handle disk full error + switch (signo) { + case SIGSEGV: + case SIGBUS: + if (output_buffer_start <= info->si_addr && + info->si_addr < output_buffer_end) { + const char msg[] = "mold: failed to write to an output file. Disk full?\n"; + (void)!write(STDERR_FILENO, msg, sizeof(msg) - 1); + } + break; + case SIGABRT: { + (void)!write(STDERR_FILENO, &sigabrt_msg[0], sigabrt_msg.size()); + break; + } + } + + // Re-throw the signal + signal(SIGSEGV, SIG_DFL); + signal(SIGBUS, SIG_DFL); + signal(SIGABRT, SIG_DFL); + + cleanup(); + raise(signo); +} + +void install_signal_handler() { + struct sigaction action; + action.sa_sigaction = sighandler; + sigemptyset(&action.sa_mask); + action.sa_flags = SA_SIGINFO; + + sigaction(SIGSEGV, &action, NULL); + sigaction(SIGBUS, &action, NULL); + + // OneTBB 2021.9.0 has the interface version 12090. + if (TBB_runtime_interface_version() < 12090) { + sigabrt_msg = "mold: aborted\n" + "mold: mold with libtbb version 2021.9.0 or older is known to be unstable " + "under heavy load. 
Your libtbb version is " + + std::string(TBB_runtime_version()) + + ". Please upgrade your libtbb library and try again.\n"; + + sigaction(SIGABRT, &action, NULL); + } +} + +} // namespace mold diff --git a/lib/signal-win32.cc b/lib/signal-win32.cc new file mode 100644 index 00000000..a6e13bce --- /dev/null +++ b/lib/signal-win32.cc @@ -0,0 +1,55 @@ +#include "common.h" + +#include + +namespace mold { + +void cleanup() { + if (output_tmpfile) + _unlink(output_tmpfile); +} + +std::string errno_string() { + LPVOID buf; + DWORD dw = GetLastError(); + + FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + nullptr, dw, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPTSTR)&buf, 0, nullptr); + + std::string ret = (char *)buf; + LocalFree(buf); + return ret; +} + +static LONG WINAPI vectored_handler(_EXCEPTION_POINTERS *exception_info) { + static std::mutex mu; + std::scoped_lock lock{mu}; + + PEXCEPTION_RECORD rec = exception_info->ExceptionRecord; + ULONG_PTR *p = rec->ExceptionInformation; + + if (rec->ExceptionCode == EXCEPTION_IN_PAGE_ERROR && + (ULONG_PTR)output_buffer_start <= p[1] && + p[1] < (ULONG_PTR)output_buffer_end) { + static const char msg[] = + "mold: failed to write to an output file. Disk full?\n"; + (void)!_write(_fileno(stderr), msg, sizeof(msg) - 1); + } else if (rec->ExceptionCode == EXCEPTION_STACK_OVERFLOW) { + static const char msg[] = + "mold: stack overflow\n"; + (void)!_write(_fileno(stderr), msg, sizeof(msg) - 1); + } + + cleanup(); + _exit(1); +} + +void install_signal_handler() { + AddVectoredExceptionHandler(0, vectored_handler); +} + +} // namespace mold diff --git a/lib/siphash.h b/lib/siphash.h new file mode 100644 index 00000000..4837cdf1 --- /dev/null +++ b/lib/siphash.h @@ -0,0 +1,144 @@ +// This is a header-only C++20 implementation of SipHash based on the +// reference implementation. To use, just copy this header file into +// your project and #include it. 
+// +// https://github.com/rui314/siphash/blob/main/siphash.h + +#include +#include +#include + +template +class SipHashTmpl { +public: + static_assert(OUTLEN == 64 || OUTLEN == 128); + + SipHashTmpl(void *key) { + uint64_t k0 = read64(key); + uint64_t k1 = read64((char *)key + 8); + + v0 = 0x736f6d6570736575 ^ k0; + v1 = 0x646f72616e646f6d ^ k1; + v2 = 0x6c7967656e657261 ^ k0; + v3 = 0x7465646279746573 ^ k1; + + if (OUTLEN == 128) + v1 ^= 0xee; + } + + void update(void *msgp, int64_t msglen) { + char *msg = (char *)msgp; + sum += msglen; + + if (buflen) { + if (buflen + msglen < 8) { + memcpy(buf + buflen, msg, msglen); + buflen += msglen; + return; + } + + int j = 8 - buflen; + memcpy(buf + buflen, msg, j); + compress(read64(buf)); + + msg += j; + msglen -= j; + buflen = 0; + } + + while (msglen >= 8) { + compress(read64(msg)); + msg += 8; + msglen -= 8; + } + + memcpy(buf, msg, msglen); + buflen = msglen; + } + + void finish(void *out) { + memset(buf + buflen, 0, 8 - buflen); + compress(((uint64_t)sum << 56) | read64(buf)); + + v2 ^= (OUTLEN == 128) ? 
0xee : 0xff; + finalize(); + write64(out, v0 ^ v1 ^ v2 ^ v3); + + if (OUTLEN == 128) { + v1 ^= 0xdd; + finalize(); + write64((char *)out + 8, v0 ^ v1 ^ v2 ^ v3); + } + } + + static void hash(void *out, void *key, void *in, int inlen) { + SipHashTmpl h(key); + h.update(in, inlen); + h.finish(out); + } + +private: + uint64_t v0, v1, v2, v3; + uint8_t buf[8]; + uint8_t buflen = 0; + uint8_t sum = 0; + + uint64_t read64(void *loc) { + uint64_t val; + memcpy(&val, loc, 8); + if (std::endian::native == std::endian::big) + val = bswap(val); + return val; + } + + void write64(void *loc, uint64_t val) { + if (std::endian::native == std::endian::big) + val = bswap(val); + memcpy(loc, &val, 8); + } + + uint64_t bswap(uint64_t val) { + return ((val << 56) & 0xff00000000000000) | + ((val << 40) & 0x00ff000000000000) | + ((val << 24) & 0x0000ff0000000000) | + ((val << 8) & 0x000000ff00000000) | + ((val >> 8) & 0x00000000ff000000) | + ((val >> 24) & 0x0000000000ff0000) | + ((val >> 40) & 0x000000000000ff00) | + ((val >> 56) & 0x00000000000000ff); + } + + void round() { + v0 += v1; + v1 = std::rotl(v1, 13); + v1 ^= v0; + v0 = std::rotl(v0, 32); + v2 += v3; + v3 = std::rotl(v3, 16); + v3 ^= v2; + v0 += v3; + v3 = std::rotl(v3, 21); + v3 ^= v0; + v2 += v1; + v1 = std::rotl(v1, 17); + v1 ^= v2; + v2 = std::rotl(v2, 32); + } + + void compress(uint64_t m) { + v3 ^= m; + for (int i = 0; i < C_ROUNDS; i++) + round(); + v0 ^= m; + } + + void finalize() { + for (int i = 0; i < D_ROUNDS; i++) + round(); + } +}; + +using SipHash = SipHashTmpl<2, 4, 64>; +using SipHash128 = SipHashTmpl<2, 4, 128>; +using SipHash13 = SipHashTmpl<1, 3, 64>; +using SipHash13_128 = SipHashTmpl<1, 3, 128>; diff --git a/common/tar.cc b/lib/tar.cc similarity index 70% rename from common/tar.cc rename to lib/tar.cc index 9ad9fa0e..30f464bc 100644 --- a/common/tar.cc +++ b/lib/tar.cc @@ -1,7 +1,15 @@ +// This file contains functions to create a tar file. 
+ #include "common.h" +#ifdef _WIN32 +# define ftruncate _chsize_s +#endif + namespace mold { +static constexpr i64 BLOCK_SIZE = 512; + // A tar file consists of one or more Ustar header followed by data. // Each Ustar header represents a single file in an archive. // @@ -11,26 +19,6 @@ namespace mold { // // For simplicity, we always emit a PAX header even for a short filename. struct UstarHeader { - UstarHeader() { - memset(this, 0, sizeof(*this)); - } - - void finalize() { - memset(checksum, ' ', sizeof(checksum)); - memcpy(magic, "ustar", 5); - memcpy(version, "00", 2); - - // Compute checksum - int sum = 0; - for (i64 i = 0; i < sizeof(*this); i++) - sum += ((u8 *)this)[i]; - - // We need to convince the compiler that sum isn't too big to silence - // -Werror=format-truncation. - ASSUME(sum < 01'000'000); - snprintf(checksum, sizeof(checksum), "%06o", sum); - } - char name[100]; char mode[8]; char uid[8]; @@ -50,7 +38,24 @@ struct UstarHeader { char pad[12]; }; -static_assert(sizeof(UstarHeader) == 512); +static_assert(sizeof(UstarHeader) == BLOCK_SIZE); + +static void finalize(UstarHeader &hdr) { + memset(hdr.checksum, ' ', sizeof(hdr.checksum)); + memcpy(hdr.magic, "ustar", 5); + memcpy(hdr.version, "00", 2); + + // Compute checksum + int sum = 0; + for (i64 i = 0; i < sizeof(hdr); i++) + sum += ((u8 *)&hdr)[i]; + + // We need to convince the compiler that sum isn't too big to silence + // -Werror=format-truncation. 
+ if (sum >= 01'000'000) + unreachable(); + snprintf(hdr.checksum, sizeof(hdr.checksum), "%06o", sum); +} static std::string encode_path(std::string basedir, std::string path) { path = path_clean(basedir + "/" + path); @@ -78,14 +83,13 @@ TarWriter::~TarWriter() { void TarWriter::append(std::string path, std::string_view data) { // Write PAX header - static_assert(sizeof(UstarHeader) == BLOCK_SIZE); - UstarHeader pax; + UstarHeader pax = {}; std::string attr = encode_path(basedir, path); snprintf(pax.size, sizeof(pax.size), "%011zo", attr.size()); pax.name[0] = '/'; pax.typeflag[0] = 'x'; - pax.finalize(); + finalize(pax); fwrite(&pax, sizeof(pax), 1, out); // Write pathname @@ -93,23 +97,18 @@ void TarWriter::append(std::string path, std::string_view data) { fseek(out, align_to(ftell(out), BLOCK_SIZE), SEEK_SET); // Write Ustar header - UstarHeader ustar; + UstarHeader ustar = {}; memcpy(ustar.mode, "0000664", 8); snprintf(ustar.size, sizeof(ustar.size), "%011zo", data.size()); - ustar.finalize(); + finalize(ustar); fwrite(&ustar, sizeof(ustar), 1, out); // Write file contents fwrite(data.data(), data.size(), 1, out); fseek(out, align_to(ftell(out), BLOCK_SIZE), SEEK_SET); - // A tar file must ends with two empty blocks, so write such - // terminator and seek back. 
- u8 terminator[BLOCK_SIZE * 2] = {}; - fwrite(&terminator, BLOCK_SIZE * 2, 1, out); - fseek(out, -BLOCK_SIZE * 2, SEEK_END); - - assert(ftell(out) % BLOCK_SIZE == 0); + // A tar file must ends with two empty blocks + (void)!ftruncate(fileno(out), ftell(out) + BLOCK_SIZE * 2); } } // namespace mold diff --git a/common/update-git-hash.cmake b/lib/update-git-hash.cmake similarity index 100% rename from common/update-git-hash.cmake rename to lib/update-git-hash.cmake diff --git a/elf/arch-arm32.cc b/src/arch-arm32.cc similarity index 88% rename from elf/arch-arm32.cc rename to src/arch-arm32.cc index 3ad58d8e..7ef37392 100644 --- a/elf/arch-arm32.cc +++ b/src/arch-arm32.cc @@ -37,7 +37,7 @@ #include #include -namespace mold::elf { +namespace mold { using E = ARM32; @@ -194,7 +194,7 @@ void write_addend(u8 *loc, i64 val, const ElfRel &rel) { template <> void write_plt_header(Context &ctx, u8 *buf) { - static const ul32 insn[] = { + constexpr ul32 insn[] = { 0xe52d'e004, // push {lr} 0xe59f'e004, // ldr lr, 2f 0xe08f'e00e, // 1: add lr, pc, lr @@ -209,7 +209,7 @@ void write_plt_header(Context &ctx, u8 *buf) { *(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16; } -static const ul32 plt_entry[] = { +constexpr ul32 plt_entry[] = { 0xe59f'c004, // 1: ldr ip, 2f 0xe08c'c00f, // add ip, ip, pc 0xe59c'f000, // ldr pc, [ip] @@ -228,11 +228,24 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { *(ul32 *)(buf + 12) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 12; } -// ARM does not use .eh_frame for exception handling. Instead, it uses -// .ARM.exidx and .ARM.extab. So this function is empty. 
template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, - u64 offset, u64 val) {} + u64 offset, u64 val) { + u8 *loc = ctx.buf + this->shdr.sh_offset + offset; + + switch (rel.r_type) { + case R_NONE: + break; + case R_ARM_ABS32: + *(ul32 *)loc = val; + break; + case R_ARM_REL32: + *(ul32 *)loc = val - this->shdr.sh_addr - offset; + break; + default: + Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; + } +} // ARM and Thumb branch instructions can jump within ±16 MiB. static bool is_jump_reachable(i64 val) { @@ -243,11 +256,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable { for (; i < output_section->thunks.size(); i++) { i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset - @@ -286,7 +294,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_ARM_ABS32: case R_ARM_TARGET1: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_ARM_REL32: *(ul32 *)loc = S + A - P; @@ -476,19 +483,21 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // .L2: .word foo + . - .L1 // R_ARM_TLS_GOTDESC // - // We may relax the instructions to the following for non-dlopen'd DSO + // We may relax the instructions to the following if its TP-relative + // address is known at link-time // // ldr r0, .L2 - // .L1: ldr r0, [pc, r0] + // .L1: nop // ... - // .L2: .word foo(gottpoff) + . - .L1 + // .L2: .word foo(tpoff) // - // or to the following for executable. + // or to the following if the TP-relative address is known at + // process startup time. // // ldr r0, .L2 - // .L1: nop + // .L1: ldr r0, [pc, r0] // ... - // .L2: .word foo(tpoff) + // .L2: .word foo(gottpoff) + . 
- .L1 if (sym.has_tlsdesc(ctx)) { // A is odd if the corresponding TLS_CALL is Thumb. *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - ((A & 1) ? 6 : 4); @@ -571,8 +580,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -587,11 +594,9 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { - case R_ARM_ABS32: - case R_ARM_MOVT_ABS: - case R_ARM_THM_MOVT_ABS: - case R_ARM_TARGET1: - scan_dyn_absrel(ctx, sym, rel); + case R_ARM_MOVW_ABS_NC: + case R_ARM_THM_MOVW_ABS_NC: + scan_absrel(ctx, sym, rel); break; case R_ARM_THM_CALL: case R_ARM_CALL: @@ -627,15 +632,17 @@ void InputSection::scan_relocations(Context &ctx) { case R_ARM_TLS_LE32: check_tlsle(ctx, sym, rel); break; + case R_ARM_ABS32: + case R_ARM_MOVT_ABS: + case R_ARM_THM_MOVT_ABS: + case R_ARM_TARGET1: case R_ARM_REL32: case R_ARM_BASE_PREL: case R_ARM_GOTOFF32: case R_ARM_THM_JUMP11: case R_ARM_THM_JUMP19: case R_ARM_MOVW_PREL_NC: - case R_ARM_MOVW_ABS_NC: case R_ARM_THM_MOVW_PREL_NC: - case R_ARM_THM_MOVW_ABS_NC: case R_ARM_TLS_LDO32: case R_ARM_V4BX: case R_ARM_TLS_GOTDESC: @@ -647,11 +654,11 @@ void InputSection::scan_relocations(Context &ctx) { } template <> -void RangeExtensionThunk::copy_buf(Context &ctx) { +void Thunk::copy_buf(Context &ctx) { // TLS trampoline code. ARM32's TLSDESC is designed so that this // common piece of code is factored out from object files to reduce // output size. Since no one provide, the linker has to synthesize it. - static ul32 hdr[] = { + constexpr ul32 hdr[] = { 0xe08e'0000, // add r0, lr, r0 0xe590'1004, // ldr r1, [r0, #4] 0xe12f'ff11, // bx r1 @@ -660,7 +667,7 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { // This is a range extension and mode switch thunk. 
// It has two entry points: +0 for Thumb and +4 for ARM. - const u8 entry[] = { + static const u8 entry[] = { // .thumb 0x78, 0x47, // bx pc # jumps to 1f 0xc0, 0x46, // nop @@ -693,6 +700,45 @@ u64 get_eflags(Context &ctx) { return EF_ARM_EABI_VER5; } +void create_arm_exidx_section(Context &ctx) { + for (i64 i = 0; i < ctx.chunks.size(); i++) { + OutputSection *osec = ctx.chunks[i]->to_osec(); + + if (osec && osec->shdr.sh_type == SHT_ARM_EXIDX) { + auto *sec = new Arm32ExidxSection(*osec); + ctx.extra.exidx = sec; + ctx.chunks[i] = sec; + ctx.chunk_pool.emplace_back(sec); + + for (InputSection *isec : osec->members) + isec->is_alive = false; + break; + } + } +} + +void Arm32ExidxSection::compute_section_size(Context &ctx) { + output_section.compute_section_size(ctx); + this->shdr.sh_size = output_section.shdr.sh_size; +} + +void Arm32ExidxSection::update_shdr(Context &ctx) { + // .ARM.exidx's sh_link should be set to the .text section index. + // Runtime doesn't care about it, but the binutils's strip command does. + if (Chunk *chunk = find_chunk(ctx, ".text")) + this->shdr.sh_link = chunk->shndx; +} + +void Arm32ExidxSection::remove_duplicate_entries(Context &ctx) { + this->shdr.sh_size = get_contents(ctx).size(); +} + +void Arm32ExidxSection::copy_buf(Context &ctx) { + std::vector contents = get_contents(ctx); + assert(this->shdr.sh_size = contents.size()); + write_vector(ctx.buf + this->shdr.sh_offset, contents); +} + // ARM executables use an .ARM.exidx section to look up an exception // handling record for the current instruction pointer. The table needs // to be sorted by their addresses. @@ -701,13 +747,12 @@ u64 get_eflags(Context &ctx) { // I don't know why only ARM uses the different mechanism, but it's // likely that it's due to some historical reason. // -// This function sorts .ARM.exidx records. -void fixup_arm_exidx_section(Context &ctx) { - Timer t(ctx, "fixup_arm_exidx_section"); +// This function returns contents of .ARM.exidx. 
+std::vector Arm32ExidxSection::get_contents(Context &ctx) { + std::vector buf(output_section.shdr.sh_size); - OutputSection *osec = find_section(ctx, SHT_ARM_EXIDX); - if (!osec) - return; + output_section.shdr.sh_addr = this->shdr.sh_addr; + output_section.write_to(ctx, buf.data(), nullptr); // .ARM.exidx records consists of a signed 31-bit relative address // and a 32-bit value. The relative address indicates the start @@ -721,24 +766,24 @@ void fixup_arm_exidx_section(Context &ctx) { // // CANTUNWIND is value 1. The most significant bit is set in (2) but // not in (3). So we can distinguished them just by looking at a value. - const u32 EXIDX_CANTUNWIND = 1; + const u32 CANTUNWIND = 1; struct Entry { ul32 addr; ul32 val; }; - if (osec->shdr.sh_size % sizeof(Entry)) + if (buf.size() % sizeof(Entry)) Fatal(ctx) << "invalid .ARM.exidx section size"; - Entry *ent = (Entry *)(ctx.buf + osec->shdr.sh_offset); - i64 num_entries = osec->shdr.sh_size / sizeof(Entry); + Entry *ent = (Entry *)buf.data(); + i64 num_entries = buf.size() / sizeof(Entry); // Entry's addresses are relative to themselves. In order to sort - // records by addresses, we first translate them so that the addresses + // records by address, we first translate them so that the addresses // are relative to the beginning of the section. auto is_relative = [](u32 val) { - return val != EXIDX_CANTUNWIND && !(val & 0x8000'0000); + return val != CANTUNWIND && !(val & 0x8000'0000); }; tbb::parallel_for((i64)0, num_entries, [&](i64 i) { @@ -748,10 +793,21 @@ void fixup_arm_exidx_section(Context &ctx) { ent[i].val = 0x7fff'ffff & (ent[i].val + offset); }); - tbb::parallel_sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) { + std::sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) { return a.addr < b.addr; }); + // Remove duplicate adjacent entries. 
That is, if two adjacent functions + // have the same compact unwind info or are both CANTUNWIND, we can + // merge them into a single address range. + auto it = std::unique(ent, ent + num_entries, + [](const Entry &a, const Entry &b) { + return a.val == b.val; + }); + + num_entries = it - ent; + buf.resize(num_entries * sizeof(Entry)); + // Make addresses relative to themselves. tbb::parallel_for((i64)0, num_entries, [&](i64 i) { i64 offset = sizeof(Entry) * i; @@ -760,14 +816,7 @@ void fixup_arm_exidx_section(Context &ctx) { ent[i].val = 0x7fff'ffff & (ent[i].val - offset); }); - // .ARM.exidx's sh_link should be set to the .text section index. - // Runtime doesn't care about it, but the binutils's strip command does. - if (ctx.shdr) { - if (Chunk *text = find_section(ctx, ".text")) { - osec->shdr.sh_link = text->shndx; - ctx.shdr->copy_buf(ctx); - } - } + return buf; } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-arm64.cc b/src/arch-arm64.cc similarity index 95% rename from elf/arch-arm64.cc rename to src/arch-arm64.cc index 75fddba0..6fc237b8 100644 --- a/elf/arch-arm64.cc +++ b/src/arch-arm64.cc @@ -19,7 +19,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = ARM64; @@ -46,16 +46,16 @@ static u64 page(u64 val) { template <> void write_plt_header(Context &ctx, u8 *buf) { - static const ul32 insn[] = { + constexpr ul32 insn[] = { 0xa9bf'7bf0, // stp x16, x30, [sp,#-16]! 
0x9000'0010, // adrp x16, .got.plt[2] 0xf940'0211, // ldr x17, [x16, .got.plt[2]] 0x9100'0210, // add x16, x16, .got.plt[2] 0xd61f'0220, // br x17 - 0xd503'201f, // nop - 0xd503'201f, // nop - 0xd503'201f, // nop - }; + 0xd420'7d00, // brk + 0xd420'7d00, // brk + 0xd420'7d00, // brk + }; u64 gotplt = ctx.gotplt->shdr.sh_addr + 16; u64 plt = ctx.plt->shdr.sh_addr; @@ -68,7 +68,7 @@ void write_plt_header(Context &ctx, u8 *buf) { template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { - static const ul32 insn[] = { + constexpr ul32 insn[] = { 0x9000'0010, // adrp x16, .got.plt[n] 0xf940'0211, // ldr x17, [x16, .got.plt[n]] 0x9100'0210, // add x16, x16, .got.plt[n] @@ -86,11 +86,11 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { - static const ul32 insn[] = { + constexpr ul32 insn[] = { 0x9000'0010, // adrp x16, GOT[n] 0xf940'0211, // ldr x17, [x16, GOT[n]] 0xd61f'0220, // br x17 - 0xd503'201f, // nop + 0xd420'7d00, // brk }; u64 got = sym.get_got_pltgot_addr(ctx); @@ -145,11 +145,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -173,7 +168,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_AARCH64_ABS64: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_AARCH64_LDST8_ABS_LO12_NC: case R_AARCH64_ADD_ABS_LO12_NC: @@ -383,19 +377,21 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // blr x1 // R_AARCH64_TLSDESC_CALL foo // - // We may relax the instructions to the following for non-dlopen'd DSO + // We may relax the instructions to the following if its TP-relative + // address 
is known at link-time // // nop // nop - // adrp x0, :gottprel:foo - // ldr x0, [x0, :gottprel_lo12:foo] + // movz x0, :tls_offset_hi:foo, lsl #16 + // movk x0, :tls_offset_lo:foo // - // or to the following for executable. + // or to the following if the TP-relative address is known at + // process startup time. // // nop // nop - // movz x0, :tls_offset_hi:foo, lsl #16 - // movk x0, :tls_offset_lo:foo + // adrp x0, :gottprel:foo + // ldr x0, [x0, :gottprel_lo12:foo] if (sym.has_tlsdesc(ctx)) { i64 val = page(sym.get_tlsdesc_addr(ctx) + A) - page(P); check(val, -(1LL << 32), 1LL << 32); @@ -488,8 +484,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -505,9 +499,6 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { - case R_AARCH64_ABS64: - scan_dyn_absrel(ctx, sym, rel); - break; case R_AARCH64_MOVW_UABS_G3: scan_absrel(ctx, sym, rel); break; @@ -567,6 +558,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: check_tlsle(ctx, sym, rel); break; + case R_AARCH64_ABS64: case R_AARCH64_ADD_ABS_LO12_NC: case R_AARCH64_ADR_PREL_LO21: case R_AARCH64_CONDBR19: @@ -602,12 +594,12 @@ void InputSection::scan_relocations(Context &ctx) { } template <> -void RangeExtensionThunk::copy_buf(Context &ctx) { - static const ul32 insn[] = { +void Thunk::copy_buf(Context &ctx) { + constexpr ul32 insn[] = { 0x9000'0010, // adrp x16, 0 # R_AARCH64_ADR_PREL_PG_HI21 0x9100'0210, // add x16, x16 # R_AARCH64_ADD_ABS_LO12_NC 0xd61f'0200, // br x16 - 0xd503'201f, // nop + 0xd420'7d00, // brk }; static_assert(E::thunk_size == sizeof(insn)); @@ -626,4 +618,4 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { } } -} // namespace mold::elf +} // namespace 
mold diff --git a/elf/arch-i386.cc b/src/arch-i386.cc similarity index 85% rename from elf/arch-i386.cc rename to src/arch-i386.cc index 5c534256..008faaf0 100644 --- a/elf/arch-i386.cc +++ b/src/arch-i386.cc @@ -35,7 +35,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = I386; @@ -226,7 +226,7 @@ static void relax_gd_to_le(u8 *loc, ElfRel rel, u64 val) { } // Relax LD to LE -static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { +static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 tls_size) { switch (rel.r_type) { case R_386_PLT32: case R_386_PC32: { @@ -235,7 +235,7 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { 0x2d, 0, 0, 0, 0, // sub $tls_size, %eax }; memcpy(loc - 2, insn, sizeof(insn)); - *(ul32 *)(loc + 5) = val; + *(ul32 *)(loc + 5) = tls_size; break; } case R_386_GOT32: @@ -246,7 +246,7 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { 0x90, // nop }; memcpy(loc - 2, insn, sizeof(insn)); - *(ul32 *)(loc + 5) = val; + *(ul32 *)(loc + 5) = tls_size; break; } default: @@ -254,15 +254,38 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { } } +static u32 relax_tlsdesc_to_ie(u8 *loc) { + switch ((loc[0] << 8) | loc[1]) { + case 0x8d83: return 0x8b83; // lea 0(%ebx), %eax -> mov 0(%ebx), %eax + case 0x8d9b: return 0x8b9b; // lea 0(%ebx), %ebx -> mov 0(%ebx), %ebx + case 0x8d8b: return 0x8b8b; // lea 0(%ebx), %ecx -> mov 0(%ebx), %ecx + case 0x8d93: return 0x8b93; // lea 0(%ebx), %edx -> mov 0(%ebx), %edx + case 0x8db3: return 0x8bb3; // lea 0(%ebx), %esi -> mov 0(%ebx), %esi + case 0x8dbb: return 0x8bbb; // lea 0(%ebx), %edi -> mov 0(%ebx), %edi + case 0x8da3: return 0x8ba3; // lea 0(%ebx), %esp -> mov 0(%ebx), %esp + case 0x8dab: return 0x8bab; // lea 0(%ebx), %ebp -> mov 0(%ebx), %ebp + } + return 0; +} + +static u32 relax_tlsdesc_to_le(u8 *loc) { + switch ((loc[0] << 8) | loc[1]) { + case 0x8d83: return 0x90b8; // lea 0(%ebx), %eax -> mov $0, %eax + case 0x8d9b: return 0x90bb; // lea 0(%ebx), 
%ebx -> mov $0, %ebx + case 0x8d8b: return 0x90b9; // lea 0(%ebx), %ecx -> mov $0, %ecx + case 0x8d93: return 0x90ba; // lea 0(%ebx), %edx -> mov $0, %edx + case 0x8db3: return 0x90be; // lea 0(%ebx), %esi -> mov $0, %esi + case 0x8dbb: return 0x90bf; // lea 0(%ebx), %edi -> mov $0, %edi + case 0x8da3: return 0x90bc; // lea 0(%ebx), %esp -> mov $0, %esp + case 0x8dab: return 0x90bd; // lea 0(%ebx), %ebp -> mov $0, %ebp + } + return 0; +} + template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -294,7 +317,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul16 *)loc = S + A; break; case R_386_32: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_386_PC8: check(S + A - P, -(1 << 7), 1 << 7); @@ -346,7 +368,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_386_TLS_LDM: if (ctx.got->has_tlsld(ctx)) *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; - else + else relax_ld_to_le(loc, rels[++i], ctx.tp_addr - ctx.tls_begin); break; case R_386_TLS_LDO_32: @@ -364,24 +386,41 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // call *(%eax) // R_386_TLS_DESC_CALL foo // - // We may relax the instructions to the following for non-dlopen'd DSO + // We may relax the instructions to the following if its TP-relative + // address is known at link-time // - // mov foo@GOTTPOFF(%ebx), %eax + // mov $foo@TPOFF, %eax // nop // - // or to the following for executable. + // or to the following if the TP-relative address is known at + // process startup time. 
// - // mov $foo@TPOFF, %eax + // mov foo@GOTTPOFF(%ebx), %eax // nop + // + // We allow the following alternative code sequence too because + // LLVM emits such code. + // + // lea 0(%ebx), %reg + // R_386_TLS_GOTDESC foo + // mov %reg, %eax + // call *(%eax) + // R_386_TLS_DESC_CALL foo if (sym.has_tlsdesc(ctx)) { *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - GOT; } else if (sym.has_gottp(ctx)) { - loc[-2] = 0x8b; - loc[-1] = 0x83; + u32 insn = relax_tlsdesc_to_ie(loc - 2); + if (!insn) + Fatal(ctx) << *this << ": illegal instruction sequence for TLSDESC"; + loc[-2] = insn >> 8; + loc[-1] = insn; *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; } else { - loc[-2] = 0x90; - loc[-1] = 0xb8; + u32 insn = relax_tlsdesc_to_le(loc - 2); + if (!insn) + Fatal(ctx) << *this << ": illegal instruction sequence for TLSDESC"; + loc[-2] = insn >> 8; + loc[-1] = insn; *(ul32 *)loc = S + A - ctx.tp_addr; } break; @@ -475,8 +514,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -506,9 +543,6 @@ void InputSection::scan_relocations(Context &ctx) { case R_386_16: scan_absrel(ctx, sym, rel); break; - case R_386_32: - scan_dyn_absrel(ctx, sym, rel); - break; case R_386_PC8: case R_386_PC16: case R_386_PC32: @@ -538,8 +572,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_386_TLS_GD: // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). - if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) || - ctx.arg.is_static) + if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) i++; else sym.flags |= NEEDS_TLSGD; @@ -547,7 +580,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_386_TLS_LDM: // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). 
- if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared)) + if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) i++; else ctx.needs_tlsld = true; @@ -558,6 +591,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_386_TLS_LE: check_tlsle(ctx, sym, rel); break; + case R_386_32: case R_386_GOTOFF: case R_386_TLS_LDO_32: case R_386_SIZE32: @@ -569,4 +603,4 @@ void InputSection::scan_relocations(Context &ctx) { } } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-loongarch.cc b/src/arch-loongarch.cc similarity index 53% rename from elf/arch-loongarch.cc rename to src/arch-loongarch.cc index 6942dfdb..dda138e9 100644 --- a/elf/arch-loongarch.cc +++ b/src/arch-loongarch.cc @@ -10,21 +10,22 @@ // bootstrapping the entire ecosystem for LoongArch, sending patches to // Linux, GCC, LLVM, etc. // -// All instructions are 4 bytes long in LoongArch and aligned to 4-byte -// boundaries. It has 32 general-purpose registers. Among these, $t0 - $t8 -// (aliases for $r12 - $r20) are temporary registers that we can use in -// our PLT and range extension thunks. +// Speaking of the ISA, all instructions are 4 byte long and aligned to 4 +// byte boundaries in LoongArch. It has 32 general-purpose registers. +// Among these, $t0 - $t8 (aliases for $r12 - $r20) are temporary +// registers that we can use in our PLT. // -// The psABI defines a few linker relaxations. We haven't supported them -// yet. +// Just like RISC-V, LoongArch supports section-shrinking relaxations. +// That is, it allows linkers to rewrite certain instruction sequences to +// shorter ones. Sections are not an atomic unit of copying. 
// -// https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html +// https://github.com/loongson/la-abi-specs/blob/release/laelf.adoc #if MOLD_LOONGARCH64 || MOLD_LOONGARCH32 #include "mold.h" -namespace mold::elf { +namespace mold { using E = MOLD_TARGET; @@ -50,7 +51,7 @@ static u64 hi20(u64 val, u64 pc) { return bits(page(val + 0x800) - page(pc), 31, 12); } -static u64 hi64(u64 val, u64 pc) { +static u64 higher20(u64 val, u64 pc) { // A PC-relative 64-bit address is materialized with the following // instructions for the large code model: // @@ -64,21 +65,15 @@ static u64 hi64(u64 val, u64 pc) { // ADDI.D adds a sign-extended 12 bit value to a register. LU32I.D and // LU52I.D simply set bits to [51:31] and to [63:53], respectively. // - // Compensating all the sign-extensions is a bit complicated. - u64 x = page(val) - page(pc); - if (val & 0x800) - x += 0x1000 - 0x1'0000'0000; - if (x & 0x8000'0000) - x += 0x1'0000'0000; - return x; -} - -static u64 higher20(u64 val, u64 pc) { - return bits(hi64(val, pc), 51, 32); + // Compensating all the sign-extensions is a bit complicated. The + // psABI gave the following formula. + val = val + 0x8000'0000 + ((val & 0x800) ? (0x1000 - 0x1'0000'0000) : 0); + return bits(page(val) - page(pc - 8), 51, 32); } static u64 highest12(u64 val, u64 pc) { - return bits(hi64(val, pc), 63, 52); + val = val + 0x8000'0000 + ((val & 0x800) ? 
(0x1000 - 0x1'0000'0000) : 0); + return bits(page(val) - page(pc - 12), 63, 52); } static void write_k12(u8 *loc, u32 val) { @@ -113,9 +108,47 @@ static void write_d10k16(u8 *loc, u32 val) { *(ul32 *)loc |= bits(val, 25, 16); } +static u32 get_rd(u32 insn) { + return bits(insn, 4, 0); +} + +static u32 get_rj(u32 insn) { + return bits(insn, 9, 5); +} + +static void set_rj(u8 *loc, u32 rj) { + assert(rj < 32); + *(ul32 *)loc &= 0b111111'1111111111111111'00000'11111; + *(ul32 *)loc |= rj << 5; +} + +// Returns true if isec's i'th relocation refers to the following +// relaxable instructioon pair. +// +// pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20 +// ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12 +static bool is_relaxable_got_load(Context &ctx, InputSection &isec, i64 i) { + std::span> rels = isec.get_rels(ctx); + Symbol &sym = *isec.file.symbols[rels[i].r_sym]; + + if (ctx.arg.relax && + sym.is_pcrel_linktime_const(ctx) && + i + 3 < rels.size() && + rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && + rels[i + 2].r_offset == rels[i].r_offset + 4 && + rels[i + 3].r_type == R_LARCH_RELAX) { + u32 insn1 = *(ul32 *)(isec.contents.data() + rels[i].r_offset); + u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); + bool is_ld_d = (insn2 & 0xffc0'0000) == 0x28c0'0000; + return get_rd(insn1) == get_rd(insn2) && get_rd(insn2) == get_rj(insn2) && + is_ld_d; + } + return false; +} + template <> void write_plt_header(Context &ctx, u8 *buf) { - static const ul32 insn_64[] = { + constexpr ul32 insn_64[] = { 0x1a00'000e, // pcalau12i $t2, %pc_hi20(.got.plt) 0x0011'bdad, // sub.d $t1, $t1, $t3 0x28c0'01cf, // ld.d $t3, $t2, %lo12(.got.plt) # _dl_runtime_resolve @@ -126,7 +159,7 @@ void write_plt_header(Context &ctx, u8 *buf) { 0x4c00'01e0, // jr $t3 }; - static const ul32 insn_32[] = { + constexpr ul32 insn_32[] = { 0x1a00'000e, // pcalau12i $t2, %pc_hi20(.got.plt) 0x0011'3dad, // sub.w $t1, $t1, $t3 0x2880'01cf, // ld.w $t3, $t2, %lo12(.got.plt) # _dl_runtime_resolve @@ -146,18 
+179,18 @@ void write_plt_header(Context &ctx, u8 *buf) { write_k12(buf + 16, gotplt); } -static const ul32 plt_entry_64[] = { +constexpr ul32 plt_entry_64[] = { 0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt) 0x28c0'01ef, // ld.d $t3, $t3, %lo12(func@.got.plt) 0x4c00'01ed, // jirl $t1, $t3, 0 - 0x0340'0000, // nop + 0x002a'0000, // break }; -static const ul32 plt_entry_32[] = { +constexpr ul32 plt_entry_32[] = { 0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt) 0x2880'01ef, // ld.w $t3, $t3, %lo12(func@.got.plt) 0x4c00'01ed, // jirl $t1, $t3, 0 - 0x0340'0000, // nop + 0x002a'0000, // break }; template <> @@ -233,10 +266,9 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); + auto get_r_delta = [&](i64 idx) { + return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx]; + }; for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; @@ -247,7 +279,9 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { continue; Symbol &sym = *file.symbols[rel.r_sym]; - u8 *loc = base + rel.r_offset; + i64 r_offset = rel.r_offset - get_r_delta(i); + i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i); + u8 *loc = base + r_offset; auto check = [&](i64 val, i64 lo, i64 hi) { if (val < lo || hi <= val) @@ -268,32 +302,28 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // ones. Therefore, G may refer to a TLSGD or a regular GOT slot // depending on the symbol type. // - // Note that as of August 2023, both GCC and Clang treat TLSLD relocs - // as if they were TLSGD relocs for LoongArch, which is a clear bug. - // We need to handle TLSLD relocs as synonyms for TLSGD relocs for the - // sake of bug compatibility. 
- auto get_got_idx = [&] { - if (sym.has_tlsgd(ctx)) - return sym.get_tlsgd_idx(ctx); - return sym.get_got_idx(ctx); - }; + // Note that even though LoongArch defines relocations for TLSLD, TLSLD + // is not actually supported on it. GCC and LLVM emit identical machine + // code for -ftls-model=global-dynamic and -ftls-model=local-dynamic, + // and we need to handle TLSLD relocations as equivalent to TLSGD + // relocations. This is clearly a compiler bug, but it's too late to + // fix. The only way to fix it would be to define a new set of + // relocations for true TLSLD and deprecate the current ones. But it + // appears that migrating to TLSDESC is a better choice, so it's + // unlikely to happen. + i64 got_idx = + sym.has_tlsgd(ctx) ? sym.get_tlsgd_idx(ctx) : sym.get_got_idx(ctx); u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; - u64 P = get_addr() + rel.r_offset; - u64 G = get_got_idx() * sizeof(Word); + u64 P = get_addr() + r_offset; + u64 G = got_idx * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; switch (rel.r_type) { case R_LARCH_32: if constexpr (E::is_64) *(ul32 *)loc = S + A; - else - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); - break; - case R_LARCH_64: - assert(E::is_64); - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_LARCH_B16: check_branch(S + A - P, -(1 << 17), 1 << 17); @@ -303,13 +333,10 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { check_branch(S + A - P, -(1 << 22), 1 << 22); write_d5k16(loc, (S + A - P) >> 2); break; - case R_LARCH_B26: { - i64 val = S + A - P; - if (val < -(1 << 27) || (1 << 27) <= val) - val = get_thunk_addr(i) + A - P; - write_d10k16(loc, val >> 2); + case R_LARCH_B26: + check_branch(S + A - P, -(1 << 27), 1 << 27); + write_d10k16(loc, (S + A - P) >> 2); break; - } case R_LARCH_ABS_LO12: write_k12(loc, S + A); break; @@ -333,7 +360,15 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_k12(loc, S + A); break; case R_LARCH_PCALA_HI20: - 
write_j20(loc, hi20(S + A, P)); + if (removed_bytes == 0) { + write_j20(loc, hi20(S + A, P)); + } else { + // Rewrite pcalau12i + addi.d with pcaddi + assert(removed_bytes == 4); + *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi + write_j20(loc, (S + A - P) >> 2); + i += 3; + } break; case R_LARCH_PCALA64_LO20: write_j20(loc, higher20(S + A, P)); @@ -345,7 +380,37 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_k12(loc, GOT + G + A); break; case R_LARCH_GOT_PC_HI20: - write_j20(loc, hi20(GOT + G + A, P)); + if (removed_bytes == 0) { + // If the PC-relative symbol address is known at link-time, we can + // rewrite the following GOT load + // + // pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20 + // ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12 + // + // with the following address materialization + // + // pcalau12i $t0, 0 + // addi.d $t0, $t0, 0 + if (is_relaxable_got_load(ctx, *this, i)) { + i64 dist = compute_distance(ctx, sym, *this, rel); + if (-(1LL << 31) <= dist && dist < (1LL << 31)) { + u32 rd = get_rd(*(ul32 *)loc); + *(ul32 *)(loc + 4) = 0x02c0'0000 | (rd << 5) | rd; // addi.d + + write_j20(loc, hi20(S + A, P)); + write_k12(loc + 4, S + A); + i += 3; + break; + } + } + write_j20(loc, hi20(GOT + G + A, P)); + } else { + // Rewrite pcalau12i + ld.d with pcaddi + assert(removed_bytes == 4); + *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi + write_j20(loc, (S + A - P) >> 2); + i += 3; + } break; case R_LARCH_GOT64_PC_LO20: write_j20(loc, higher20(GOT + G + A, P)); @@ -401,13 +466,13 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_LARCH_TLS_IE64_HI12: write_k12(loc, (sym.get_gottp_addr(ctx) + A) >> 52); break; - case R_LARCH_TLS_LD_PC_HI20: case R_LARCH_TLS_GD_PC_HI20: + case R_LARCH_TLS_LD_PC_HI20: check(sym.get_tlsgd_addr(ctx) + A - P, -(1LL << 31), 1LL << 31); write_j20(loc, hi20(sym.get_tlsgd_addr(ctx) + A, P)); break; - case R_LARCH_TLS_LD_HI20: case R_LARCH_TLS_GD_HI20: + case 
R_LARCH_TLS_LD_HI20: write_j20(loc, (sym.get_tlsgd_addr(ctx) + A) >> 12); break; case R_LARCH_ADD6: @@ -446,12 +511,148 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_LARCH_64_PCREL: *(ul64 *)loc = S + A - P; break; + case R_LARCH_CALL36: + if (removed_bytes == 0) { + write_j20(loc, (S + A - P + 0x20000) >> 18); + write_k16(loc + 4, (S + A - P) >> 2); + } else { + // Rewrite PCADDU18I + JIRL to B or BL + assert(removed_bytes == 4); + if (get_rd(*(ul32 *)(contents.data() + rel.r_offset + 4)) == 0) + *(ul32 *)loc = 0x5000'0000; // B + else + *(ul32 *)loc = 0x5400'0000; // BL + write_d10k16(loc, (S + A - P) >> 2); + } + break; case R_LARCH_ADD_ULEB128: overwrite_uleb(loc, read_uleb(loc) + S + A); break; case R_LARCH_SUB_ULEB128: overwrite_uleb(loc, read_uleb(loc) - S - A); break; + case R_LARCH_TLS_DESC_PC_HI20: + // LoongArch TLSDESC uses the following code sequence to materialize + // a TP-relative address in a0. + // + // pcalau12i $a0, 0 + // R_LARCH_TLS_DESC_PC_HI20 foo + // addi.[dw] $a0, $a0, 0 + // R_LARCH_TLS_DESC_PC_LO12 foo + // ld.d $ra, $a0, 0 + // R_LARCH_TLS_DESC_LD foo + // jirl $ra, $ra, 0 + // R_LARCH_TLS_DESC_CALL foo + // + // We may relax the instructions to the following if its TP-relative + // address is known at link-time + // + // + // + // lu12i.w $a0, foo@TPOFF + // addi.w $a0, $a0, foo@TPOFF + // + // or to the following if the TP offset is small enough. + // + // + // + // + // ori $a0, $zero, foo@TPOFF + // + // If the TP-relative address is known at process startup time, we + // may relax the instructions to the following. + // + // + // + // pcalau12i $a0, foo@GOTTP + // ld.[dw] $a0, $a0, foo@GOTTP + // + // If we don't know anything about the symbol, we can still relax + // the first two instructions to a single pcaddi as shown below. 
+ // + // + // pcaddi $a0, foo@GOTDESC + // ld.d $ra, $a0, 0 + // jirl $ra, $ra, 0 + // + // Note that if section-shrinking relaxation is enabled, nop may be + // completely deleted. + if (removed_bytes == 0) { + if (sym.has_tlsdesc(ctx)) { + i64 dist = sym.get_tlsdesc_addr(ctx) + A - P; + if (ctx.arg.relax && -(1 << 21) <= dist && dist < (1 << 21)) { + *(ul32 *)loc = 0x0340'0000; // nop + } else { + write_j20(loc, hi20(sym.get_tlsdesc_addr(ctx) + A, P)); + } + } else { + *(ul32 *)loc = 0x0340'0000; // nop + } + } + break; + case R_LARCH_TLS_DESC_PC_LO12: + if (removed_bytes == 0) { + if (sym.has_tlsdesc(ctx)) { + i64 dist = sym.get_tlsdesc_addr(ctx) + A - P; + if (ctx.arg.relax && -(1 << 21) <= dist && dist < (1 << 21)) { + // If we can directly materialize the PC-relative address + // with pcaddi, do that. + *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi + write_j20(loc, dist >> 2); + } else { + write_k12(loc, sym.get_tlsdesc_addr(ctx) + A); + } + } else { + *(ul32 *)loc = 0x0340'0000; // nop + } + } + break; + case R_LARCH_TLS_DESC_LD: + if (sym.has_tlsdesc(ctx) || removed_bytes == 4) { + // Do nothing + } else if (sym.has_gottp(ctx)) { + *(ul32 *)loc = 0x1a00'0004; // pcalau12i $a0, 0 + write_j20(loc, hi20(sym.get_gottp_addr(ctx) + A, P)); + } else { + *(ul32 *)loc = 0x1400'0004; // lu12i.w $a0, 0 + write_j20(loc, (S + A + 0x800 - ctx.tp_addr) >> 12); + } + break; + case R_LARCH_TLS_DESC_CALL: + if (sym.has_tlsdesc(ctx)) { + // Do nothing + } else if (sym.has_gottp(ctx)) { + if (E::is_64) + *(ul32 *)loc = 0x28c0'0084; // ld.d $a0, $a0, 0 + else + *(ul32 *)loc = 0x2880'0084; // ld.w $a0, $a0, 0 + write_k12(loc, sym.get_gottp_addr(ctx) + A); + } else { + i64 val = S + A - ctx.tp_addr; + if (val < 0x1000) + *(ul32 *)loc = 0x0380'0004; // ori $a0, $zero, 0 + else + *(ul32 *)loc = 0x0280'0084; // addi.w $a0, $a0, 0 + write_k12(loc, val); + } + break; + case R_LARCH_TLS_LE_HI20_R: + if (removed_bytes == 0) + write_j20(loc, (S + A + 0x800 - ctx.tp_addr) 
>> 12); + break; + case R_LARCH_TLS_LE_LO12_R: { + i64 val = S + A - ctx.tp_addr; + write_k12(loc, val); + + // Rewrite `addi.d $t0, $t0, ` with `addi.d $t0, $tp, ` + // if the offset is directly accessible using tp. tp is r2. + if (sign_extend(val, 11) == val) + set_rj(loc, 2); + break; + } + case R_LARCH_64: + case R_LARCH_TLS_LE_ADD_R: + break; default: unreachable(); } @@ -551,8 +752,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -576,15 +775,10 @@ void InputSection::scan_relocations(Context &ctx) { case R_LARCH_32: if constexpr (E::is_64) scan_absrel(ctx, sym, rel); - else - scan_dyn_absrel(ctx, sym, rel); - break; - case R_LARCH_64: - assert(E::is_64); - scan_dyn_absrel(ctx, sym, rel); break; case R_LARCH_B26: case R_LARCH_PCALA_HI20: + case R_LARCH_CALL36: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; @@ -596,10 +790,10 @@ void InputSection::scan_relocations(Context &ctx) { case R_LARCH_TLS_IE_PC_HI20: sym.flags |= NEEDS_GOTTP; break; - case R_LARCH_TLS_LD_PC_HI20: case R_LARCH_TLS_GD_PC_HI20: - case R_LARCH_TLS_LD_HI20: + case R_LARCH_TLS_LD_PC_HI20: case R_LARCH_TLS_GD_HI20: + case R_LARCH_TLS_LD_HI20: sym.flags |= NEEDS_TLSGD; break; case R_LARCH_32_PCREL: @@ -610,8 +804,14 @@ void InputSection::scan_relocations(Context &ctx) { case R_LARCH_TLS_LE_LO12: case R_LARCH_TLS_LE64_LO20: case R_LARCH_TLS_LE64_HI12: + case R_LARCH_TLS_LE_HI20_R: + case R_LARCH_TLS_LE_LO12_R: check_tlsle(ctx, sym, rel); break; + case R_LARCH_TLS_DESC_CALL: + scan_tlsdesc(ctx, sym); + break; + case R_LARCH_64: case R_LARCH_B16: case R_LARCH_B21: case R_LARCH_ABS_HI20: @@ -645,6 +845,10 @@ void InputSection::scan_relocations(Context &ctx) { case R_LARCH_SUB64: case R_LARCH_ADD_ULEB128: case R_LARCH_SUB_ULEB128: + case 
R_LARCH_TLS_DESC_PC_HI20: + case R_LARCH_TLS_DESC_PC_LO12: + case R_LARCH_TLS_DESC_LD: + case R_LARCH_TLS_LE_ADD_R: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; @@ -653,29 +857,157 @@ void InputSection::scan_relocations(Context &ctx) { } template <> -void RangeExtensionThunk::copy_buf(Context &ctx) { - static const ul32 insn[] = { - 0x1e00'000c, // pcaddu18i $t0, 0 - 0x4c00'0180, // jirl $zero, $t0, 0 - }; +void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { + std::span> rels = isec.get_rels(ctx); + isec.extra.r_deltas.resize(rels.size() + 1); + i64 delta = 0; - static_assert(E::thunk_size == sizeof(insn)); - - u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; - u64 P = output_section.shdr.sh_addr + offset; + for (i64 i = 0; i < rels.size(); i++) { + const ElfRel &r = rels[i]; + Symbol &sym = *isec.file.symbols[r.r_sym]; + isec.extra.r_deltas[i] = delta; + + // A R_LARCH_ALIGN relocation refers to the beginning of a nop + // sequence. We need to remove some or all of them so that the + // instruction that immediately follows that is aligned to a specified + // boundary. To allow that, a R_LARCH_ALIGN relocation that requests + // 2^n alignment refers to 2^n - 4 bytes of nop instructions. + if (r.r_type == R_LARCH_ALIGN) { + // The actual rule for storing the alignment size is a bit weird. + // In particular, the most significant 56 bits of r_addend is + // sometimes used to store the upper limit of the alignment, + // allowing the instruction that follows nops _not_ to be aligned at + // all. I think that's a spec bug, so we don't want to support that. 
+ i64 alignment; + if (r.r_sym) { + if (r.r_addend >> 8) + Fatal(ctx) << isec << ": ternary R_LARCH_ALIGN is not supported: " << i; + alignment = 1 << r.r_addend; + } else { + if (!has_single_bit(r.r_addend + 4)) + Fatal(ctx) << isec << ": R_LARCH_ALIGN: invalid alignment requirement: " + << i; + alignment = r.r_addend + 4; + } + + u64 loc = isec.get_addr() + r.r_offset - delta; + u64 next_loc = loc + alignment - 4; + delta += next_loc - align_to(loc, alignment); + continue; + } - for (Symbol *sym : symbols) { - u64 S = sym->get_addr(ctx); + // Handling other relocations is optional. + if (!ctx.arg.relax || i == rels.size() - 1 || + rels[i + 1].r_type != R_LARCH_RELAX) + continue; - memcpy(buf, insn, sizeof(insn)); - write_j20(buf, (S - P + 0x20000) >> 18); - write_k16(buf + 4, (S - P) >> 2); + // Skip linker-synthesized symbols because their final addresses + // are not fixed yet. + if (sym.file == ctx.internal_obj) + continue; - buf += sizeof(insn); - P += sizeof(insn); + switch (r.r_type) { + case R_LARCH_TLS_LE_HI20_R: + case R_LARCH_TLS_LE_ADD_R: + // LoongArch uses the following three instructions to access + // TP ± 2 GiB. + // + // lu12i.w $t0, 0 # R_LARCH_TLS_LE_HI20_R + // add.d $t0, $t0, $tp # R_LARCH_TLS_LE_ADD_R + // addi.d $t0, $t0, 0 # R_LARCH_TLS_LE_LO12_R + // + // If the thread-local variable is within TP ± 2 KiB, we can + // relax them into the following single instruction. + // + // addi.d $t0, $tp, + if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; + sign_extend(val, 11) == val) + delta += 4; + break; + case R_LARCH_PCALA_HI20: + // The following two instructions are used to materialize a + // PC-relative address with a 32 bit displacement. + // + // pcalau12i $t0, 0 # R_LARCH_PCALA_HI20 + // addi.d $t0, $t0, 0 # R_LARCH_PCALA_LO12 + // + // If the displacement is within ±2 MiB, we can relax them to + // the following instruction. 
+ // + // pcaddi $t0, + if (i + 3 < rels.size() && + rels[i + 2].r_type == R_LARCH_PCALA_LO12 && + rels[i + 2].r_offset == rels[i].r_offset + 4 && + rels[i + 3].r_type == R_LARCH_RELAX) { + i64 dist = compute_distance(ctx, sym, isec, r); + u32 insn1 = *(ul32 *)(isec.contents.data() + rels[i].r_offset); + u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); + bool is_addi_d = (insn2 & 0xffc0'0000) == 0x02c0'0000; + + if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21) && + is_addi_d && get_rd(insn1) == get_rd(insn2) && + get_rd(insn2) == get_rj(insn2)) + delta += 4; + } + break; + case R_LARCH_CALL36: + // A CALL36 relocation referes to the following instruction pair + // to jump to PC ± 128 GiB. + // + // pcaddu18i $t0, 0 # R_LARCH_CALL36 + // jirl $zero/$ra, $t0, 0 + // + // If the displacement is PC ± 128 MiB, we can use B or BL instead. + // Note that $zero is $r0 and $ra is $r1. + if (i64 dist = compute_distance(ctx, sym, isec, r); + -(1 << 27) <= dist && dist < (1 << 27)) + if (u32 jirl = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); + get_rd(jirl) == 0 || get_rd(jirl) == 1) + delta += 4; + break; + case R_LARCH_GOT_PC_HI20: + // The following two instructions are used to load a symbol address + // from the GOT. + // + // pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20 + // ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12 + // + // If the PC-relative symbol address is known at link-time, we can + // relax them to the following instruction. 
+ // + // pcaddi $t0, + if (is_relaxable_got_load(ctx, isec, i)) { + i64 dist = compute_distance(ctx, sym, isec, r); + if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21)) + delta += 4; + } + break; + case R_LARCH_TLS_DESC_PC_HI20: + if (sym.has_tlsdesc(ctx)) { + u64 P = isec.get_addr() + r.r_offset; + i64 dist = sym.get_tlsdesc_addr(ctx) + r.r_addend - P; + if (-(1 << 21) <= dist && dist < (1 << 21)) + delta += 4; + } else { + delta += 4; + } + break; + case R_LARCH_TLS_DESC_PC_LO12: + if (!sym.has_tlsdesc(ctx)) + delta += 4; + break; + case R_LARCH_TLS_DESC_LD: + if (!sym.has_tlsdesc(ctx) && !sym.has_gottp(ctx) && + sym.get_addr(ctx) + r.r_addend - ctx.tp_addr < 0x1000) + delta += 4; + break; + } } + + isec.extra.r_deltas[rels.size()] = delta; + isec.sh_size -= delta; } -} // namespace mold::elf +} // namespace mold #endif diff --git a/elf/arch-m68k.cc b/src/arch-m68k.cc similarity index 95% rename from elf/arch-m68k.cc rename to src/arch-m68k.cc index f9de3be0..edffe048 100644 --- a/elf/arch-m68k.cc +++ b/src/arch-m68k.cc @@ -16,7 +16,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = M68K; @@ -78,11 +78,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -126,7 +121,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_68K_32: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_68K_16: write16(S + A); @@ -251,8 +245,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = 
get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { @@ -266,9 +258,6 @@ void InputSection::scan_relocations(Context &ctx) { Error(ctx) << sym << ": GNU ifunc symbol is not supported on m68k"; switch (rel.r_type) { - case R_68K_32: - scan_dyn_absrel(ctx, sym, rel); - break; case R_68K_16: case R_68K_8: scan_absrel(ctx, sym, rel); @@ -312,6 +301,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_68K_TLS_LE8: check_tlsle(ctx, sym, rel); break; + case R_68K_32: case R_68K_TLS_LDO32: case R_68K_TLS_LDO16: case R_68K_TLS_LDO8: @@ -322,4 +312,4 @@ void InputSection::scan_relocations(Context &ctx) { } } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-ppc32.cc b/src/arch-ppc32.cc similarity index 95% rename from elf/arch-ppc32.cc rename to src/arch-ppc32.cc index 380c7e6b..4525e73d 100644 --- a/elf/arch-ppc32.cc +++ b/src/arch-ppc32.cc @@ -42,7 +42,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = PPC32; @@ -54,7 +54,7 @@ static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; } template <> void write_plt_header(Context &ctx, u8 *buf) { - static const ub32 insn[] = { + constexpr ub32 insn[] = { // Get the address of this PLT section 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 @@ -88,7 +88,7 @@ void write_plt_header(Context &ctx, u8 *buf) { loc[5] |= lo(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4); } -static const ub32 plt_entry[] = { +constexpr ub32 plt_entry[] = { // Get the address of this PLT entry 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 @@ -148,11 +148,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - u64 GOT2 = file.extra.got2 ? 
file.extra.got2->get_addr() : 0; for (i64 i = 0; i < rels.size(); i++) { @@ -170,10 +165,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 GOT = ctx.got->shdr.sh_addr; switch (rel.r_type) { - case R_PPC_ADDR32: - case R_PPC_UADDR32: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); - break; case R_PPC_ADDR14: *(ub32 *)loc |= bits(S + A, 15, 2) << 2; break; @@ -275,6 +266,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_PPC_GOT_TPREL16: *(ub16 *)loc = sym.get_gottp_addr(ctx) - GOT; break; + case R_PPC_ADDR32: + case R_PPC_UADDR32: case R_PPC_TLS: case R_PPC_TLSGD: case R_PPC_TLSLD: @@ -323,8 +316,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -339,10 +330,6 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { - case R_PPC_ADDR32: - case R_PPC_UADDR32: - scan_dyn_absrel(ctx, sym, rel); - break; case R_PPC_ADDR14: case R_PPC_ADDR16: case R_PPC_UADDR16: @@ -391,6 +378,8 @@ void InputSection::scan_relocations(Context &ctx) { case R_PPC_TPREL16_HA: check_tlsle(ctx, sym, rel); break; + case R_PPC_ADDR32: + case R_PPC_UADDR32: case R_PPC_LOCAL24PC: case R_PPC_TLS: case R_PPC_TLSGD: @@ -408,8 +397,8 @@ void InputSection::scan_relocations(Context &ctx) { } template <> -void RangeExtensionThunk::copy_buf(Context &ctx) { - static const ub32 local_thunk[] = { +void Thunk::copy_buf(Context &ctx) { + constexpr ub32 local_thunk[] = { // Get this thunk's address 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 @@ -450,4 +439,4 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { } } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-ppc64v1.cc b/src/arch-ppc64v1.cc similarity index 95% rename from 
elf/arch-ppc64v1.cc rename to src/arch-ppc64v1.cc index 806c3fc7..e3ec1c55 100644 --- a/elf/arch-ppc64v1.cc +++ b/src/arch-ppc64v1.cc @@ -50,7 +50,7 @@ #include #include -namespace mold::elf { +namespace mold { using E = PPC64V1; @@ -68,7 +68,7 @@ static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; } // resolved addresses. template <> void write_plt_header(Context &ctx, u8 *buf) { - static const ub32 insn[] = { + constexpr ub32 insn[] = { 0x7d88'02a6, // mflr r12 0x429f'0005, // bcl 20, 31, 4 // obtain PC 0x7d68'02a6, // mflr r11 @@ -101,7 +101,7 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { // call to the PLT entry jumps to. So we need to strictly follow the PLT // section layout as the loader expect it to be. if (idx < 0x8000) { - static const ub32 insn[] = { + constexpr ub32 insn[] = { 0x3800'0000, // li r0, PLT_INDEX 0x4b00'0000, // b plt0 }; @@ -110,7 +110,7 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { loc[0] |= idx; loc[1] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 4) & 0x00ff'ffff; } else { - static const ub32 insn[] = { + constexpr ub32 insn[] = { 0x3c00'0000, // lis r0, PLT_INDEX@high 0x6000'0000, // ori r0, r0, PLT_INDEX@lo 0x4b00'0000, // b plt0 @@ -154,11 +154,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -182,11 +177,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 TOC = ctx.extra.TOC->value; switch (rel.r_type) { - case R_PPC64_ADDR64: - apply_toc_rel(ctx, sym, rel, loc, S, A, P, &dynrel); - break; case R_PPC64_TOC: - apply_toc_rel(ctx, *ctx.extra.TOC, rel, loc, TOC, A, P, &dynrel); break; case R_PPC64_TOC16_HA: *(ub16 *)loc = ha(S + A - TOC); @@ -262,15 +253,22 
@@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_PPC64_DTPREL16_LO: *(ub16 *)loc = lo(S + A - ctx.dtp_addr); break; + case R_PPC64_DTPREL16_LO_DS: + *(ub16 *)loc |= (S + A - ctx.dtp_addr) & 0xfffc; + break; case R_PPC64_TPREL16_HA: *(ub16 *)loc = ha(S + A - ctx.tp_addr); break; case R_PPC64_TPREL16_LO: *(ub16 *)loc = lo(S + A - ctx.tp_addr); break; + case R_PPC64_TPREL16_LO_DS: + *(ub16 *)loc |= (S + A - ctx.tp_addr) & 0xfffc; + break; case R_PPC64_GOT_TPREL16_LO_DS: *(ub16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc; break; + case R_PPC64_ADDR64: case R_PPC64_PLTSEQ: case R_PPC64_PLTCALL: case R_PPC64_TLS: @@ -335,8 +333,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -356,10 +352,6 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_PPC_OPD; switch (rel.r_type) { - case R_PPC64_ADDR64: - case R_PPC64_TOC: - scan_toc_rel(ctx, sym, rel); - break; case R_PPC64_GOT_TPREL16_HA: sym.flags |= NEEDS_GOTTP; break; @@ -378,8 +370,11 @@ void InputSection::scan_relocations(Context &ctx) { break; case R_PPC64_TPREL16_HA: case R_PPC64_TPREL16_LO: + case R_PPC64_TPREL16_LO_DS: check_tlsle(ctx, sym, rel); break; + case R_PPC64_ADDR64: + case R_PPC64_TOC: case R_PPC64_REL32: case R_PPC64_REL64: case R_PPC64_TOC16_HA: @@ -401,6 +396,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_PPC64_TLSLD: case R_PPC64_DTPREL16_HA: case R_PPC64_DTPREL16_LO: + case R_PPC64_DTPREL16_LO_DS: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; @@ -409,11 +405,11 @@ void InputSection::scan_relocations(Context &ctx) { } template <> -void RangeExtensionThunk::copy_buf(Context &ctx) { +void Thunk::copy_buf(Context &ctx) { // If the destination is .plt.got, we save the current r2, 
read an // address of a function descriptor from .got, restore %r2 and jump // to the function. - static const ub32 pltgot_thunk[] = { + constexpr ub32 pltgot_thunk[] = { // Store the caller's %r2 0xf841'0028, // std %r2, 40(%r1) @@ -431,7 +427,7 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { }; // If the destination is .plt, read a function descriptor from .got.plt. - static const ub32 plt_thunk[] = { + constexpr ub32 plt_thunk[] = { // Store the caller's %r2 0xf841'0028, // std %r2, 40(%r1) @@ -450,7 +446,7 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { // If the destination is a non-imported function, we directly jump // to the function entry address. - static const ub32 local_thunk[] = { + constexpr ub32 local_thunk[] = { 0x3d82'0000, // addis r12, r2, foo@toc@ha 0x398c'0000, // addi r12, r12, foo@toc@lo 0x7d89'03a6, // mtctr r12 @@ -638,15 +634,14 @@ void ppc64v1_scan_symbols(Context &ctx) { }); // Functions referenced by the ELF header also have to have .opd entries. 
- auto mark = [&](std::string_view name) { - if (!name.empty()) - if (Symbol &sym = *get_symbol(ctx, name); !sym.is_imported) - sym.flags |= NEEDS_PPC_OPD; - }; + if (!ctx.arg.entry->is_imported) + ctx.arg.entry->flags |= NEEDS_PPC_OPD; + + if (!ctx.arg.init->is_imported) + ctx.arg.init->flags |= NEEDS_PPC_OPD; - mark(ctx.arg.entry); - mark(ctx.arg.init); - mark(ctx.arg.fini); + if (!ctx.arg.fini->is_imported) + ctx.arg.fini->flags |= NEEDS_PPC_OPD; } void PPC64OpdSection::add_symbol(Context &ctx, Symbol *sym) { @@ -682,4 +677,4 @@ void PPC64OpdSection::copy_buf(Context &ctx) { } } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-ppc64v2.cc b/src/arch-ppc64v2.cc similarity index 76% rename from elf/arch-ppc64v2.cc rename to src/arch-ppc64v2.cc index 7b6f5335..78456fdb 100644 --- a/elf/arch-ppc64v2.cc +++ b/src/arch-ppc64v2.cc @@ -82,7 +82,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = PPC64V2; @@ -106,7 +106,7 @@ static void write34(u8 *loc, u64 x) { // resolved addresses. 
template <> void write_plt_header(Context &ctx, u8 *buf) { - static const ul32 insn[] = { + constexpr ul32 insn[] = { // Get PC 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 // obtain PC @@ -186,11 +186,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -210,12 +205,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { auto no_r2save_thunk_addr = [&] { return get_thunk_addr(i) + 8; }; switch (rel.r_type) { - case R_PPC64_ADDR64: - if (name() == ".toc") - apply_toc_rel(ctx, sym, rel, loc, S, A, P, &dynrel); - else - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); - break; case R_PPC64_TOC16_HA: *(ul16 *)loc = ha(S + A - TOC); break; @@ -319,6 +308,9 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_PPC64_DTPREL16_LO: *(ul16 *)loc = lo(S + A - ctx.dtp_addr); break; + case R_PPC64_DTPREL16_LO_DS: + *(ul16 *)loc |= (S + A - ctx.dtp_addr) & 0xfffc; + break; case R_PPC64_DTPREL34: write34(loc, S + A - ctx.dtp_addr); break; @@ -328,9 +320,13 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_PPC64_TPREL16_LO: *(ul16 *)loc = lo(S + A - ctx.tp_addr); break; + case R_PPC64_TPREL16_LO_DS: + *(ul16 *)loc |= (S + A - ctx.tp_addr) & 0xfffc; + break; case R_PPC64_TPREL34: write34(loc, S + A - ctx.tp_addr); break; + case R_PPC64_ADDR64: case R_PPC64_PLTSEQ: case R_PPC64_PLTSEQ_NOTOC: case R_PPC64_PLTCALL: @@ -397,8 +393,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations 
@@ -413,12 +407,6 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { - case R_PPC64_ADDR64: - if (name() == ".toc") - scan_toc_rel(ctx, sym, rel); - else - scan_dyn_absrel(ctx, sym, rel); - break; case R_PPC64_GOT_TPREL16_HA: case R_PPC64_GOT_TPREL_PCREL34: sym.flags |= NEEDS_GOTTP; @@ -448,9 +436,11 @@ void InputSection::scan_relocations(Context &ctx) { break; case R_PPC64_TPREL16_HA: case R_PPC64_TPREL16_LO: + case R_PPC64_TPREL16_LO_DS: case R_PPC64_TPREL34: check_tlsle(ctx, sym, rel); break; + case R_PPC64_ADDR64: case R_PPC64_REL32: case R_PPC64_REL64: case R_PPC64_TOC16_HA: @@ -475,6 +465,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_PPC64_TLSLD: case R_PPC64_DTPREL16_HA: case R_PPC64_DTPREL16_LO: + case R_PPC64_DTPREL16_LO_DS: case R_PPC64_DTPREL34: break; default: @@ -484,10 +475,10 @@ void InputSection::scan_relocations(Context &ctx) { } template <> -void RangeExtensionThunk::copy_buf(Context &ctx) { +void Thunk::copy_buf(Context &ctx) { // If the destination is PLT, we read an address from .got.plt or .got // and jump there. - static const ul32 plt_thunk[] = { + constexpr ul32 plt_thunk[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x3d82'0000, // addis r12, r2, foo@gotplt@toc@ha @@ -496,7 +487,7 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { 0x4e80'0420, // bctr }; - static const ul32 plt_thunk_power10[] = { + constexpr ul32 plt_thunk_power10[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x0410'0000, // pld r12, foo@gotplt@pcrel @@ -507,7 +498,7 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { // If the destination is a non-imported function, we directly jump // to its local entry point. 
- static const ul32 local_thunk[] = { + constexpr ul32 local_thunk[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x3d82'0000, // addis r12, r2, foo@toc@ha @@ -516,7 +507,7 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { - static const ul32 local_thunk_power10[] = { + constexpr ul32 local_thunk_power10[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x0610'0000, // pla r12, foo@pcrel @@ -564,9 +555,109 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { } } +// GCC may emit references to the following functions in function prologue +// and epilogue if -Os is specified. For some reason, these functions are +// not in libgcc.a and expected to be synthesized by the linker. +const std::vector> +ppc64_save_restore_insns = { + { "_savegpr0_14", 0xf9c1ff70 }, // std r14,-144(r1) + { "_savegpr0_15", 0xf9e1ff78 }, // std r15,-136(r1) + { "_savegpr0_16", 0xfa01ff80 }, // std r16,-128(r1) + { "_savegpr0_17", 0xfa21ff88 }, // std r17,-120(r1) + { "_savegpr0_18", 0xfa41ff90 }, // std r18,-112(r1) + { "_savegpr0_19", 0xfa61ff98 }, // std r19,-104(r1) + { "_savegpr0_20", 0xfa81ffa0 }, // std r20,-96(r1) + { "_savegpr0_21", 0xfaa1ffa8 }, // std r21,-88(r1) + { "_savegpr0_22", 0xfac1ffb0 }, // std r22,-80(r1) + { "_savegpr0_23", 0xfae1ffb8 }, // std r23,-72(r1) + { "_savegpr0_24", 0xfb01ffc0 }, // std r24,-64(r1) + { "_savegpr0_25", 0xfb21ffc8 }, // std r25,-56(r1) + { "_savegpr0_26", 0xfb41ffd0 }, // std r26,-48(r1) + { "_savegpr0_27", 0xfb61ffd8 }, // std r27,-40(r1) + { "_savegpr0_28", 0xfb81ffe0 }, // std r28,-32(r1) + { "_savegpr0_29", 0xfba1ffe8 }, // std r29,-24(r1) + { "_savegpr0_30", 0xfbc1fff0 }, // std r30,-16(r1) + { "_savegpr0_31", 0xfbe1fff8 }, // std r31,-8(r1) + { "", 0xf8010010 }, // std r0,16(r1) + { "", 0x4e800020 }, // blr + + { "_restgpr0_14", 0xe9c1ff70 }, // ld r14,-144(r1) + { "_restgpr0_15", 0xe9e1ff78 }, // ld r15,-136(r1) + { "_restgpr0_16", 0xea01ff80 }, // ld r16,-128(r1) + { "_restgpr0_17", 0xea21ff88 
}, // ld r17,-120(r1) + { "_restgpr0_18", 0xea41ff90 }, // ld r18,-112(r1) + { "_restgpr0_19", 0xea61ff98 }, // ld r19,-104(r1) + { "_restgpr0_20", 0xea81ffa0 }, // ld r20,-96(r1) + { "_restgpr0_21", 0xeaa1ffa8 }, // ld r21,-88(r1) + { "_restgpr0_22", 0xeac1ffb0 }, // ld r22,-80(r1) + { "_restgpr0_23", 0xeae1ffb8 }, // ld r23,-72(r1) + { "_restgpr0_24", 0xeb01ffc0 }, // ld r24,-64(r1) + { "_restgpr0_25", 0xeb21ffc8 }, // ld r25,-56(r1) + { "_restgpr0_26", 0xeb41ffd0 }, // ld r26,-48(r1) + { "_restgpr0_27", 0xeb61ffd8 }, // ld r27,-40(r1) + { "_restgpr0_28", 0xeb81ffe0 }, // ld r28,-32(r1) + { "_restgpr0_29", 0xe8010010 }, // ld r0,16(r1) + { "", 0xeba1ffe8 }, // ld r29,-24(r1) + { "", 0x7c0803a6 }, // mtlr r0 + { "", 0xebc1fff0 }, // ld r30,-16(r1) + { "", 0xebe1fff8 }, // ld r31,-8(r1) + { "", 0x4e800020 }, // blr + { "_restgpr0_30", 0xebc1fff0 }, // ld r30,-16(r1) + { "_restgpr0_31", 0xe8010010 }, // ld r0,16(r1) + { "", 0xebe1fff8 }, // ld r31,-8(r1) + { "", 0x7c0803a6 }, // mtlr r0 + { "", 0x4e800020 }, // blr + + { "_savegpr1_14", 0xf9ccff70 }, // std r14,-144(r12) + { "_savegpr1_15", 0xf9ecff78 }, // std r15,-136(r12) + { "_savegpr1_16", 0xfa0cff80 }, // std r16,-128(r12) + { "_savegpr1_17", 0xfa2cff88 }, // std r17,-120(r12) + { "_savegpr1_18", 0xfa4cff90 }, // std r18,-112(r12) + { "_savegpr1_19", 0xfa6cff98 }, // std r19,-104(r12) + { "_savegpr1_20", 0xfa8cffa0 }, // std r20,-96(r12) + { "_savegpr1_21", 0xfaacffa8 }, // std r21,-88(r12) + { "_savegpr1_22", 0xfaccffb0 }, // std r22,-80(r12) + { "_savegpr1_23", 0xfaecffb8 }, // std r23,-72(r12) + { "_savegpr1_24", 0xfb0cffc0 }, // std r24,-64(r12) + { "_savegpr1_25", 0xfb2cffc8 }, // std r25,-56(r12) + { "_savegpr1_26", 0xfb4cffd0 }, // std r26,-48(r12) + { "_savegpr1_27", 0xfb6cffd8 }, // std r27,-40(r12) + { "_savegpr1_28", 0xfb8cffe0 }, // std r28,-32(r12) + { "_savegpr1_29", 0xfbacffe8 }, // std r29,-24(r12) + { "_savegpr1_30", 0xfbccfff0 }, // std r30,-16(r12) + { "_savegpr1_31", 0xfbecfff8 }, // std 
r31,-8(r12) + { "", 0x4e800020 }, // blr + + { "_restgpr1_14", 0xe9ccff70 }, // ld r14,-144(r12) + { "_restgpr1_15", 0xe9ecff78 }, // ld r15,-136(r12) + { "_restgpr1_16", 0xea0cff80 }, // ld r16,-128(r12) + { "_restgpr1_17", 0xea2cff88 }, // ld r17,-120(r12) + { "_restgpr1_18", 0xea4cff90 }, // ld r18,-112(r12) + { "_restgpr1_19", 0xea6cff98 }, // ld r19,-104(r12) + { "_restgpr1_20", 0xea8cffa0 }, // ld r20,-96(r12) + { "_restgpr1_21", 0xeaacffa8 }, // ld r21,-88(r12) + { "_restgpr1_22", 0xeaccffb0 }, // ld r22,-80(r12) + { "_restgpr1_23", 0xeaecffb8 }, // ld r23,-72(r12) + { "_restgpr1_24", 0xeb0cffc0 }, // ld r24,-64(r12) + { "_restgpr1_25", 0xeb2cffc8 }, // ld r25,-56(r12) + { "_restgpr1_26", 0xeb4cffd0 }, // ld r26,-48(r12) + { "_restgpr1_27", 0xeb6cffd8 }, // ld r27,-40(r12) + { "_restgpr1_28", 0xeb8cffe0 }, // ld r28,-32(r12) + { "_restgpr1_29", 0xebacffe8 }, // ld r29,-24(r12) + { "_restgpr1_30", 0xebccfff0 }, // ld r30,-16(r12) + { "_restgpr1_31", 0xebecfff8 }, // ld r31,-8(r12) + { "", 0x4e800020 }, // blr +}; + +void PPC64SaveRestoreSection::copy_buf(Context &ctx) { + ul32 *buf = (ul32 *)(ctx.buf + this->shdr.sh_offset); + for (auto [label, insn] : ppc64_save_restore_insns) + *buf++ = insn; +} + template <> u64 get_eflags(Context &ctx) { return 2; } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-riscv.cc b/src/arch-riscv.cc similarity index 77% rename from elf/arch-riscv.cc rename to src/arch-riscv.cc index cf7b05c2..fd600b61 100644 --- a/elf/arch-riscv.cc +++ b/src/arch-riscv.cc @@ -12,71 +12,20 @@ // From the linker's point of view, the RISC-V's psABI is unique because // sections in input object files can be shrunk while being copied to the // output file. That is contrary to other psABIs in which sections are an -// atomic unit of copying. Let me explain it in more details. -// -// Since RISC-V instructions are 16-bit or 32-bit long, there's no way to -// embed a very large immediate into a branch instruction. 
In fact, JAL -// (jump and link) instruction can jump to only within PC ± 1 MiB because -// its immediate is only 21 bits long. If the destination is out of its -// reach, we need to use two instructions instead; the first instruction -// being AUIPC which sets upper 20 bits to a register and the second being -// JALR with a 12-bit immediate and the register. Combined, they specify a -// 32 bits displacement. -// -// Other RISC ISAs have the same limitation, and they solved the problem by -// letting the linker create so-called "range extension thunks". It works as -// follows: the compiler optimistically emits single jump instructions for -// function calls. If the linker finds that a branch target is out of reach, -// it emits a small piece of machine code near the branch instruction and -// redirect the branch to the linker-synthesized code. The code constructs a -// full 32-bit address in a register and jump to the destination. That -// linker-synthesized code is called "range extension thunks" or just -// "thunks". -// -// The RISC-V psABI is unique that it works the other way around. That is, -// for RISC-V, the compiler always emits two instructions (AUIPC + JAL) for -// function calls. If the linker finds the destination is reachable with a -// single instruction, it replaces the two instructions with the one and -// shrink the section size by one instruction length, instead of filling the -// gap with a nop. -// -// With the presence of this relaxation, sections can no longer be -// considered as an atomic unit. If we delete 4 bytes from the middle of a -// section, all contents after that point needs to be shifted by 4. Symbol -// values and relocation offsets have to be adjusted accordingly if they -// refer to past the deleted bytes. -// -// In mold, we use `r_deltas` to memorize how many bytes have be adjusted -// for relocations. For symbols, we directly mutate their `value` member. 
-// -// RISC-V object files tend to have way more relocations than those for -// other targets. This is because all branches, including ones that jump -// within the same section, are explicitly expressed with relocations. -// Here is why we need them: all control-flow statements such as `if` or -// `for` are implemented using branch instructions. For other targets, the -// compiler doesn't emit relocations for such branches because they know -// at compile-time exactly how many bytes has to be skipped. That's not -// true to RISC-V because the linker may delete bytes between a branch and -// its destination. Therefore, all branches including in-section ones have -// to be explicitly expressed with relocations. -// -// Note that this mechanism only shrink sections and never enlarge, as -// the compiler always emits the longest instruction sequence. This -// makes the linker implementation a bit simpler because we don't need -// to worry about oscillation. +// atomic unit of copying. See file comments in shrink-sections.cc for +// details. 
// // https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc #if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE -#include "elf.h" #include "mold.h" #include #include #include -namespace mold::elf { +namespace mold { using E = MOLD_TARGET; @@ -141,7 +90,7 @@ static void set_rs1(u8 *loc, u32 rs1) { template <> void write_plt_header(Context &ctx, u8 *buf) { - static const ul32 insn_64[] = { + constexpr ul32 insn_64[] = { 0x0000'0397, // auipc t2, %pcrel_hi(.got.plt) 0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12 0x0003'be03, // ld t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve @@ -152,7 +101,7 @@ void write_plt_header(Context &ctx, u8 *buf) { 0x000e'0067, // jr t3 }; - static const ul32 insn_32[] = { + constexpr ul32 insn_32[] = { 0x0000'0397, // auipc t2, %pcrel_hi(.got.plt) 0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12 0x0003'ae03, // lw t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve @@ -172,18 +121,18 @@ void write_plt_header(Context &ctx, u8 *buf) { write_itype(buf + 16, gotplt - plt); } -static const ul32 plt_entry_64[] = { +constexpr ul32 plt_entry_64[] = { 0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt) 0x000e'3e03, // ld t3, %pcrel_lo(1b)(t3) 0x000e'0367, // jalr t1, t3 - 0x0000'0013, // nop + 0x0010'0073, // ebreak }; -static const ul32 plt_entry_32[] = { +constexpr ul32 plt_entry_32[] = { 0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt) 0x000e'2e03, // lw t3, %pcrel_lo(1b)(t3) 0x000e'0367, // jalr t1, t3 - 0x0000'0013, // nop + 0x0010'0073, // ebreak }; template <> @@ -261,11 +210,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); u64 GP = ctx.__global_pointer ? ctx.__global_pointer->get_addr(ctx) : 0; - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - auto get_r_delta = [&](i64 idx) { return extra.r_deltas.empty() ? 
0 : extra.r_deltas[idx]; }; @@ -316,12 +260,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_RISCV_32: if constexpr (E::is_64) *(U32 *)loc = S + A; - else - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_RISCV_64: - assert(E::is_64); - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_RISCV_BRANCH: check(S + A - P, -(1 << 12), 1 << 12); @@ -363,9 +303,55 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { } break; } - case R_RISCV_GOT_HI20: - write_utype(loc, G + GOT + A - P); + case R_RISCV_GOT_HI20: { + // This relocation usually refers to an AUIPC + LD instruction + // pair to load a symbol value from the GOT. If the symbol value + // is actually a link-time constant, we can materialize the value + // directly into a register to eliminate a memory load. + i64 rd = get_rd(rel.r_offset); + + switch (removed_bytes) { + case 6: + // c.li , val + *(ul16 *)loc = 0b010'0'00000'00000'01 | (rd << 7); + write_citype(loc, sym.get_addr(ctx)); + i += 3; + break; + case 4: + // addi , zero, val + *(ul32 *)loc = 0b0010011 | (rd << 7); + write_itype(loc, sym.get_addr(ctx)); + i += 3; + break; + case 0: + if (ctx.arg.relax && + sym.is_pcrel_linktime_const(ctx) && + i + 3 < rels.size() && + rels[i + 1].r_type == R_RISCV_RELAX && + rels[i + 2].r_type == R_RISCV_PCREL_LO12_I && + rels[i + 2].r_offset == rels[i].r_offset + 4 && + file.symbols[rels[i + 2].r_sym]->value == r_offset && + rels[i + 3].r_type == R_RISCV_RELAX) { + i64 val = S + A - P; + if (rd == get_rd(rel.r_offset + 4) && (i32)val == val) { + // auipc , %hi20(val) + write_utype(loc, val); + + // addi , , %lo12(val) + *(ul32 *)(loc + 4) = 0b0010011 | (rd << 15) | (rd << 7); + write_itype(loc + 4, val); + i += 3; + break; + } + } + + write_utype(loc, G + GOT + A - P); + break; + default: + unreachable(); + } break; + } case R_RISCV_TLS_GOT_HI20: write_utype(loc, sym.get_gottp_addr(ctx) + A - P); break; @@ -466,12 +452,49 @@ void 
InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; } case R_RISCV_TLSDESC_HI20: + // RISC-V TLSDESC uses the following code sequence to materialize + // a TP-relative address in x0. + // + // .L0: + // auipc tX, 0 + // R_RISCV_TLSDESC_HI20 foo + // l[d|w] tY, tX, 0 + // R_RISCV_TLSDESC_LOAD_LO12_I .L0 + // addi a0, tX, 0 + // R_RISCV_TLSDESC_ADD_LO12_I .L0 + // jalr t0, tY + // R_RISCV_TLSDESC_CALL .L0 + // + // For non-dlopen'd DSO, we may relax the instructions to the following: + // + // + // + // auipc a0, %gottp_hi(a0) + // l[d|w] a0, %gottp_lo(a0) + // + // For executable, if the TP offset is small enough, we'll relax + // it to the following: + // + // + // + // + // addi a0, zero, %tpoff_lo(a0) + // + // Otherwise, the following sequence is used: + // + // + // + // lui a0, %tpoff_hi(a0) + // addi a0, a0, %tpoff_lo(a0) if (removed_bytes == 0) write_utype(loc, sym.get_tlsdesc_addr(ctx) + A - P); break; case R_RISCV_TLSDESC_LOAD_LO12: case R_RISCV_TLSDESC_ADD_LO12: case R_RISCV_TLSDESC_CALL: { + if (removed_bytes == 4) + break; + i64 idx2 = find_paired_reloc(); const ElfRel &rel2 = rels[idx2]; Symbol &sym2 = *file.symbols[rel2.r_sym]; @@ -482,27 +505,24 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_RISCV_TLSDESC_LOAD_LO12: - if (sym2.has_tlsdesc(ctx)) - write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); + write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); break; case R_RISCV_TLSDESC_ADD_LO12: if (sym2.has_tlsdesc(ctx)) { write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); } else if (sym2.has_gottp(ctx)) { - *(ul32 *)loc = 0x517; // auipc a0, + *(ul32 *)loc = 0x517; // auipc a0, write_utype(loc, sym2.get_gottp_addr(ctx) + A - P); } else { - if (removed_bytes == 0) { - *(ul32 *)loc = 0x537; // lui a0, - write_utype(loc, S + A - ctx.tp_addr); - } + *(ul32 *)loc = 0x537; // lui a0, + write_utype(loc, S + A - ctx.tp_addr); } break; case R_RISCV_TLSDESC_CALL: if (sym2.has_tlsdesc(ctx)) { // Do 
nothing } else if (sym2.has_gottp(ctx)) { - // {ld,lw} a0, (a0) + // l[d|w] a0, *(ul32 *)loc = E::is_64 ? 0x53503 : 0x52503; write_itype(loc, sym2.get_gottp_addr(ctx) + A - P); } else { @@ -684,38 +704,9 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { } } -template <> -void InputSection::copy_contents_riscv(Context &ctx, u8 *buf) { - // If a section is not relaxed, we can copy it as a one big chunk. - if (extra.r_deltas.empty()) { - uncompress_to(ctx, buf); - return; - } - - // A relaxed section is copied piece-wise. - std::span> rels = get_rels(ctx); - i64 pos = 0; - - for (i64 i = 0; i < rels.size(); i++) { - i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i]; - if (delta == 0) - continue; - assert(delta > 0); - - const ElfRel &r = rels[i]; - memcpy(buf, contents.data() + pos, r.r_offset - pos); - buf += r.r_offset - pos; - pos = r.r_offset + delta; - } - - memcpy(buf, contents.data() + pos, contents.size() - pos); -} - template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -733,17 +724,10 @@ void InputSection::scan_relocations(Context &ctx) { case R_RISCV_32: if constexpr (E::is_64) scan_absrel(ctx, sym, rel); - else - scan_dyn_absrel(ctx, sym, rel); break; case R_RISCV_HI20: scan_absrel(ctx, sym, rel); break; - case R_RISCV_64: - if constexpr (!E::is_64) - Fatal(ctx) << *this << ": R_RISCV_64 cannot be used on RV32"; - scan_dyn_absrel(ctx, sym, rel); - break; case R_RISCV_CALL: case R_RISCV_CALL_PLT: case R_RISCV_PLT32: @@ -774,8 +758,9 @@ void InputSection::scan_relocations(Context &ctx) { break; case R_RISCV_GPREL_HI20: if (ctx.arg.shared) - Fatal(ctx) << *this << ": R_RISCV_GPREL_HI20 may not be used with -shared"; + Error(ctx) << *this << ": R_RISCV_GPREL_HI20 may not be used with -shared"; break; + case R_RISCV_64: case R_RISCV_BRANCH: case R_RISCV_JAL: case 
R_RISCV_PCREL_LO12_I: @@ -838,34 +823,9 @@ u64 get_eflags(Context &ctx) { return ret; } -static bool is_resizable(InputSection *isec) { - return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) && - (isec->shdr().sh_flags & SHF_EXECINSTR); -} - -// Returns the distance between a relocated place and a symbol. -static i64 compute_distance(Context &ctx, Symbol &sym, - InputSection &isec, const ElfRel &rel) { - // We handle absolute symbols as if they were infinitely far away - // because `shrink_section` may increase a distance between a branch - // instruction and an absolute symbol. Branching to an absolute - // location is extremely rare in real code, though. - if (sym.is_absolute()) - return INT32_MAX; - - // Likewise, relocations against weak undefined symbols won't be relaxed. - if (sym.esym().is_undef_weak()) - return INT32_MAX; - - // Compute a distance between the relocated place and the symbol. - i64 S = sym.get_addr(ctx); - i64 A = rel.r_addend; - i64 P = isec.get_addr() + rel.r_offset; - return S + A - P; -} - -// Scan relocations to shrink sections. -static void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { +// Scan relocations to a given shrink section. +template <> +void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { std::span> rels = isec.get_rels(ctx); isec.extra.r_deltas.resize(rels.size() + 1); @@ -949,6 +909,34 @@ static void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) } break; } + case R_RISCV_GOT_HI20: { + // A GOT_HI20 followed by a PCREL_LO12_I is used to load a value from + // GOT. If the loaded value is a link-time constant, we can rewrite + // the instructions to directly materialize the value, eliminating a + // memory load. 
+ if (sym.is_absolute() && + i + 3 < rels.size() && + rels[i + 1].r_type == R_RISCV_RELAX && + rels[i + 2].r_type == R_RISCV_PCREL_LO12_I && + rels[i + 2].r_offset == rels[i].r_offset + 4 && + isec.file.symbols[rels[i + 2].r_sym]->value == rels[i].r_offset && + rels[i + 3].r_type == R_RISCV_RELAX) { + i64 rd = get_rd(r.r_offset); + + if (rd == get_rd(r.r_offset + 4)) { + u64 val = sym.get_addr(ctx) + r.r_addend; + + if (use_rvc && rd != 0 && sign_extend(val, 5) == val) { + // Replace AUIPC + LD with C.LI. + delta += 6; + } else if (sign_extend(val, 11) == val) { + // Replace AUIPC + LD with ADDI. + delta += 4; + } + } + } + break; + } case R_RISCV_HI20: { u64 val = sym.get_addr(ctx) + r.r_addend; i64 rd = get_rd(r.r_offset); @@ -1018,55 +1006,6 @@ static void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) isec.sh_size -= delta; } -// Shrink sections by interpreting relocations. -// -// This operation seems to be optional, because by default longest -// instructions are being used. However, calling this function is actually -// mandatory because of R_RISCV_ALIGN. R_RISCV_ALIGN is a directive to the -// linker to align the location referred to by the relocation to a -// specified byte boundary. We at least have to interpret them to satisfy -// the alignment constraints. -template <> -i64 riscv_resize_sections(Context &ctx) { - Timer t(ctx, "riscv_resize_sections"); - - // True if we can use the 2-byte instructions. This is usually true on - // Unix because RV64GC is generally considered the baseline hardware. - bool use_rvc = get_eflags(ctx) & EF_RISCV_RVC; - - // Find all the relocations that can be relaxed. - // This step should only shrink sections. - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - for (std::unique_ptr> &isec : file->sections) - if (is_resizable(isec.get())) - shrink_section(ctx, *isec, use_rvc); - }); - - // Fix symbol values. 
- tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - for (Symbol *sym : file->symbols) { - if (sym->file != file) - continue; - - InputSection *isec = sym->get_input_section(); - if (!isec || isec->extra.r_deltas.empty()) - continue; - - std::span> rels = isec->get_rels(ctx); - auto it = std::lower_bound(rels.begin(), rels.end(), sym->value, - [&](const ElfRel &r, u64 val) { - return r.r_offset < val; - }); - - sym->value -= isec->extra.r_deltas[it - rels.begin()]; - } - }); - - // Re-compute section offset again to finalize them. - compute_section_sizes(ctx); - return set_osec_offsets(ctx); -} - // ISA name handlers // // An example of ISA name is "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0". @@ -1082,8 +1021,8 @@ i64 riscv_resize_sections(Context &ctx) { // Each extension consists of a name, a major version and a minor version. // For example, "m2p0" indicates the "m" extension of version 2.0. "p" is // just a separator. Versions are often omitted in documents, but they are -// mandatory in .riscv.attributes. Likewise, abbreviations as "g" (which -// is short for "IMAFD") are not allowed in .riscv.attributes. +// mandatory in .riscv.attributes. Likewise, abbreviations such as "G" +// (which is short for "IMAFD") are not allowed in .riscv.attributes. // // Each RISC-V object file contains an ISA string enumerating extensions // used by the object file. We need to merge input objects' ISA strings @@ -1109,7 +1048,7 @@ struct Extn { // // This function returns true if the first extension name should precede // the second one as per the rule. 
-static bool extn_name_less(const Extn &e1, const Extn &e2) { +static bool extn_name_less(std::string_view x, std::string_view y) { auto get_single_letter_rank = [](char c) -> i64 { std::string_view exts = "iemafdqlcbkjtpvnh"; size_t pos = exts.find_first_of(c); @@ -1131,57 +1070,26 @@ static bool extn_name_less(const Extn &e1, const Extn &e2) { } }; - return std::tuple{get_rank(e1.name), e1.name} < - std::tuple{get_rank(e2.name), e2.name}; -} - -static bool extn_version_less(const Extn &e1, const Extn &e2) { - return std::tuple{e1.major, e1.minor} < - std::tuple{e2.major, e2.minor}; -} - -static std::optional read_extn_string(std::string_view &str) { - auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; - static std::regex re(R"(^([a-z]+)(\d+)p(\d+))", flags); - - std::cmatch m; - if (std::regex_search(str.data(), str.data() + str.size(), m, re)) { - str = str.substr(m.length()); - return Extn{m[1], (i64)std::stoul(m[2]), (i64)std::stoul(m[3])}; - } - return {}; + return std::tuple{get_rank(x), x} < std::tuple{get_rank(y), y}; } static std::vector parse_arch_string(std::string_view str) { - if (str.size() < 5) - return {}; - - // Parse the base part - std::string_view base = str.substr(0, 5); - if (base != "rv32i" && base != "rv32e" && base != "rv64i" && base != "rv64e") - return {}; - str = str.substr(4); - - std::optional extn = read_extn_string(str); - if (!extn) - return {}; + auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; + static std::regex re(R"(^([a-z]|[a-z][a-z0-9]*[a-z])(\d+)p(\d+)(_|$))", flags); std::vector vec; - extn->name = base; - vec.push_back(*extn); - // Parse extensions - while (!str.empty()) { - if (str[0] != '_') + for (;;) { + std::cmatch m; + if (!std::regex_search(str.data(), str.data() + str.size(), m, re)) return {}; - str = str.substr(1); - std::optional extn = read_extn_string(str); - if (!extn) - return {}; - vec.push_back(*extn); + vec.push_back(Extn{m[1], 
(i64)std::stoul(m[2]), (i64)std::stoul(m[3])}); + if (m[4].length() == 0) + return vec; + + str = str.substr(m.length()); } - return vec; } static std::vector merge_extensions(std::span x, std::span y) { @@ -1194,10 +1102,13 @@ static std::vector merge_extensions(std::span x, std::span y) // Merge ISA extension strings while (!x.empty() && !y.empty()) { if (x[0].name == y[0].name) { - vec.push_back(extn_version_less(x[0], y[0]) ? y[0] : x[0]); + if (std::tuple{x[0].major, x[0].minor} < std::tuple{y[0].major, y[0].minor}) + vec.push_back(y[0]); + else + vec.push_back(x[0]); x = x.subspan(1); y = y.subspan(1); - } else if (extn_name_less(x[0], y[0])) { + } else if (extn_name_less(x[0].name, y[0].name)) { vec.push_back(x[0]); x = x.subspan(1); } else { @@ -1206,18 +1117,19 @@ static std::vector merge_extensions(std::span x, std::span y) } } - vec.insert(vec.end(), x.begin(), x.end()); - vec.insert(vec.end(), y.begin(), y.end()); + append(vec, x); + append(vec, y); return vec; } -static std::string to_string(std::span v) { - std::string str = v[0].name + std::to_string(v[0].major) + "p" + - std::to_string(v[0].minor); +static std::string to_string(const Extn &e) { + return e.name + std::to_string(e.major) + "p" + std::to_string(e.minor); +} +static std::string to_string(std::span v) { + std::string str = to_string(v[0]); for (i64 i = 1; i < v.size(); i++) - str += "_" + v[i].name + std::to_string(v[i].major) + "p" + - std::to_string(v[i].minor); + str += "_" + to_string(v[i]); return str; } @@ -1300,9 +1212,9 @@ void RiscvAttributesSection::update_shdr(Context &ctx) { template <> void RiscvAttributesSection::copy_buf(Context &ctx) { - memcpy(ctx.buf + this->shdr.sh_offset, contents.data(), contents.size()); + write_vector(ctx.buf + this->shdr.sh_offset, contents); } -} // namespace mold::elf +} // namespace mold #endif diff --git a/elf/arch-s390x.cc b/src/arch-s390x.cc similarity index 94% rename from elf/arch-s390x.cc rename to src/arch-s390x.cc index 
5fe7539d..dedc607c 100644 --- a/elf/arch-s390x.cc +++ b/src/arch-s390x.cc @@ -37,7 +37,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = S390X; @@ -116,11 +116,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -153,7 +148,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_390_64: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_390_8: check(S + A, 0, 1 << 8); @@ -256,7 +250,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ub32 *)loc = (GOT + A - P) >> 1; break; case R_390_GOTENT: - check(GOT + G + A - P, -(1LL << 32), 1LL << 32); + check_dbl(GOT + G + A - P, -(1LL << 32), 1LL << 32); *(ub32 *)loc = (GOT + G + A - P) >> 1; break; case R_390_TLS_LE32: @@ -303,22 +297,14 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_390_TLS_LDM32: if (ctx.got->has_tlsld(ctx)) *(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; + else + *(ub32 *)loc = ctx.dtp_addr - ctx.tp_addr; break; case R_390_TLS_LDM64: if (ctx.got->has_tlsld(ctx)) *(ub64 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; - break; - case R_390_TLS_LDO32: - if (ctx.got->has_tlsld(ctx)) - *(ub32 *)loc = S + A - ctx.dtp_addr; - else - *(ub32 *)loc = S + A - ctx.tp_addr; - break; - case R_390_TLS_LDO64: - if (ctx.got->has_tlsld(ctx)) - *(ub64 *)loc = S + A - ctx.dtp_addr; else - *(ub64 *)loc = S + A - ctx.tp_addr; + *(ub64 *)loc = ctx.dtp_addr - ctx.tp_addr; break; case R_390_TLS_LDCALL: if (!ctx.got->has_tlsld(ctx)) { @@ -327,6 +313,12 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { memcpy(loc, insn, sizeof(insn)); } break; + case R_390_TLS_LDO32: + 
*(ub32 *)loc = S + A - ctx.dtp_addr; + break; + case R_390_TLS_LDO64: + *(ub64 *)loc = S + A - ctx.dtp_addr; + break; default: unreachable(); } @@ -385,8 +377,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -401,9 +391,6 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { - case R_390_64: - scan_dyn_absrel(ctx, sym, rel); - break; case R_390_8: case R_390_12: case R_390_16: @@ -457,8 +444,7 @@ void InputSection::scan_relocations(Context &ctx) { // We always want to relax calls to __tls_get_offset() in statically- // linked executables because __tls_get_offset() in libc.a just calls // abort(). - if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) || - ctx.arg.is_static) { + if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { // Do nothing } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { sym.flags |= NEEDS_GOTTP; @@ -468,7 +454,7 @@ void InputSection::scan_relocations(Context &ctx) { break; case R_390_TLS_LDM32: case R_390_TLS_LDM64: - if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared)) { + if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) { // Do nothing } else { ctx.needs_tlsld = true; @@ -478,6 +464,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_390_TLS_LE64: check_tlsle(ctx, sym, rel); break; + case R_390_64: case R_390_TLS_LDO32: case R_390_TLS_LDO64: case R_390_TLS_GDCALL: @@ -489,4 +476,4 @@ void InputSection::scan_relocations(Context &ctx) { } } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-sh4.cc b/src/arch-sh4.cc similarity index 95% rename from elf/arch-sh4.cc rename to src/arch-sh4.cc index 46d61128..8e5d336a 100644 --- a/elf/arch-sh4.cc +++ b/src/arch-sh4.cc 
@@ -60,7 +60,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = SH4; @@ -90,6 +90,28 @@ i64 get_addend(u8 *loc, const ElfRel &rel) { } } +template <> +void write_addend(u8 *loc, i64 val, const ElfRel &rel) { + switch (rel.r_type) { + case R_SH_DIR32: + case R_SH_REL32: + case R_SH_TLS_GD_32: + case R_SH_TLS_LD_32: + case R_SH_TLS_LDO_32: + case R_SH_TLS_IE_32: + case R_SH_TLS_LE_32: + case R_SH_TLS_DTPMOD32: + case R_SH_TLS_DTPOFF32: + case R_SH_TLS_TPOFF32: + case R_SH_GOT32: + case R_SH_PLT32: + case R_SH_GOTOFF: + case R_SH_GOTPC: + case R_SH_GOTPLT32: + *(ul32 *)loc = val; + } +} + template <> void write_plt_header(Context &ctx, u8 *buf) { if (ctx.arg.pic) { @@ -208,11 +230,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -229,7 +246,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_SH_DIR32: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_SH_REL32: case R_SH_PLT32: @@ -301,8 +317,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { @@ -316,9 +330,6 @@ void InputSection::scan_relocations(Context &ctx) { Error(ctx) << sym << ": GNU ifunc symbol is not supported on sh4"; switch (rel.r_type) { - case R_SH_DIR32: - scan_dyn_absrel(ctx, sym, rel); - break; case R_SH_REL32: scan_pcrel(ctx, sym, rel); break; @@ -341,6 +352,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_SH_TLS_LE_32: check_tlsle(ctx, 
sym, rel); break; + case R_SH_DIR32: case R_SH_GOTPC: case R_SH_GOTOFF: case R_SH_TLS_LDO_32: @@ -351,4 +363,4 @@ void InputSection::scan_relocations(Context &ctx) { } } -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-sparc64.cc b/src/arch-sparc64.cc similarity index 84% rename from elf/arch-sparc64.cc rename to src/arch-sparc64.cc index bebbe11d..b04bb301 100644 --- a/elf/arch-sparc64.cc +++ b/src/arch-sparc64.cc @@ -58,7 +58,7 @@ #include "mold.h" -namespace mold::elf { +namespace mold { using E = SPARC64; @@ -142,11 +142,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -169,9 +164,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 GOT = ctx.got->shdr.sh_addr; switch (rel.r_type) { - case R_SPARC_64: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); - break; case R_SPARC_5: check(S + A, 0, 1 << 5); *(ub32 *)loc |= bits(S + A, 4, 0); @@ -359,27 +351,75 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ub32 *)loc |= bits(S + A, 11, 0); break; case R_SPARC_TLS_GD_HI22: - *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10); + if (sym.has_tlsgd(ctx)) { + *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10); + } else if (sym.has_gottp(ctx)) { + *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 31, 10); + } else { + *(ub32 *)loc |= bits(~(S + A - ctx.tp_addr), 31, 10); + } break; case R_SPARC_TLS_GD_LO10: - *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0); + if (sym.has_tlsgd(ctx)) { + *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0); + } else if (sym.has_gottp(ctx)) { + u32 rd = bits(*(ub32 *)loc, 29, 25); + *(ub32 *)loc = 0x8010'2000 | 
(rd << 25) | (rd << 14); // or %reg, $0, %reg + *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 9, 0); + } else { + u32 rd = bits(*(ub32 *)loc, 29, 25); + *(ub32 *)loc = 0x8018'2000 | (rd << 25) | (rd << 14); // xor %reg, $0, %reg + *(ub32 *)loc |= bits(S + A - ctx.tp_addr, 9, 0) | 0b1'1100'0000'0000; + } + break; + case R_SPARC_TLS_GD_ADD: + if (sym.has_tlsgd(ctx)) { + // do nothing + } else if (sym.has_gottp(ctx)) { + u32 rs2 = bits(*(ub32 *)loc, 4, 0); + *(ub32 *)loc = 0xd05d'c000 | rs2; // ldx [ %l7 + %reg ], %o0 + } else { + u32 rs2 = bits(*(ub32 *)loc, 4, 0); + *(ub32 *)loc = 0x9001'c000 | rs2; // add %g7, %reg, %o0 + } break; case R_SPARC_TLS_GD_CALL: - case R_SPARC_TLS_LDM_CALL: { - u64 addr; - if (ctx.arg.is_static) - addr = ctx.extra.tls_get_addr_sec->shdr.sh_addr; - else - addr = ctx.extra.tls_get_addr_sym->get_addr(ctx); - - *(ub32 *)loc |= bits(addr + A - P, 31, 2); + if (sym.has_tlsgd(ctx)) { + u64 addr = ctx.extra.tls_get_addr->get_addr(ctx); + *(ub32 *)loc |= bits(addr + A - P, 31, 2); + } else if (sym.has_gottp(ctx)) { + *(ub32 *)loc = 0x9001'c008; // add %g7, %o0, %o0 + } else { + *(ub32 *)loc = 0x0100'0000; // nop + } break; - } case R_SPARC_TLS_LDM_HI22: - *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10); + if (ctx.got->has_tlsld(ctx)) + *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10); + else + *(ub32 *)loc |= bits(ctx.tp_addr - ctx.tls_begin, 31, 10); break; case R_SPARC_TLS_LDM_LO10: - *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0); + if (ctx.got->has_tlsld(ctx)) + *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0); + else + *(ub32 *)loc |= bits(ctx.tp_addr - ctx.tls_begin, 9, 0); + break; + case R_SPARC_TLS_LDM_ADD: + if (ctx.got->has_tlsld(ctx)) { + // do nothing + } else { + u32 rs2 = bits(*(ub32 *)loc, 4, 0); + *(ub32 *)loc = 0x9021'c000 | rs2; // sub %g7, %reg, %o0 + } + break; + case R_SPARC_TLS_LDM_CALL: + if (ctx.got->has_tlsld(ctx)) { + u64 addr = 
ctx.extra.tls_get_addr->get_addr(ctx); + *(ub32 *)loc |= bits(addr + A - P, 31, 2); + } else { + *(ub32 *)loc = 0x0100'0000; // nop + } break; case R_SPARC_TLS_LDO_HIX22: *(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 31, 10); @@ -402,8 +442,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_SPARC_SIZE32: *(ub32 *)loc = sym.esym().st_size + A; break; - case R_SPARC_TLS_GD_ADD: - case R_SPARC_TLS_LDM_ADD: + case R_SPARC_64: case R_SPARC_TLS_LDO_ADD: case R_SPARC_TLS_IE_LD: case R_SPARC_TLS_IE_LDX: @@ -471,8 +510,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -487,9 +524,6 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { - case R_SPARC_64: - scan_dyn_absrel(ctx, sym, rel); - break; case R_SPARC_8: case R_SPARC_5: case R_SPARC_6: @@ -554,24 +588,36 @@ void InputSection::scan_relocations(Context &ctx) { scan_pcrel(ctx, sym, rel); break; case R_SPARC_TLS_GD_HI22: - sym.flags |= NEEDS_TLSGD; + if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { + // We always relax if -static because libc.a doesn't contain + // __tls_get_addr(). + } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { + sym.flags |= NEEDS_GOTTP; + } else { + sym.flags |= NEEDS_TLSGD; + } break; case R_SPARC_TLS_LDM_HI22: - ctx.needs_tlsld = true; + if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) { + // We always relax if -static because libc.a doesn't contain + // __tls_get_addr(). 
+ } else { + ctx.needs_tlsld = true; + } break; case R_SPARC_TLS_IE_HI22: sym.flags |= NEEDS_GOTTP; break; case R_SPARC_TLS_GD_CALL: case R_SPARC_TLS_LDM_CALL: - if (!ctx.arg.is_static) - if (Symbol &sym = *ctx.extra.tls_get_addr_sym; sym.is_imported) - sym.flags |= NEEDS_PLT; + if (Symbol *sym = ctx.extra.tls_get_addr; sym->is_imported) + sym->flags |= NEEDS_PLT; break; case R_SPARC_TLS_LE_HIX22: case R_SPARC_TLS_LE_LOX10: check_tlsle(ctx, sym, rel); break; + case R_SPARC_64: case R_SPARC_GOTDATA_OP_LOX10: case R_SPARC_GOTDATA_OP: case R_SPARC_GOTDATA_LOX10: @@ -594,25 +640,4 @@ void InputSection::scan_relocations(Context &ctx) { } } -// __tls_get_addr is not defined by libc.a, so we can't use that function -// in statically-linked executables. This section provides a replacement. -void SparcTlsGetAddrSection::copy_buf(Context &ctx) { - ub32 *buf = (ub32 *)(ctx.buf + this->shdr.sh_offset); - - static const ub32 insn[] = { - 0x0300'0000, // sethi %hi(TP_SIZE), %g1 - 0x8210'6000, // or %g1, %lo(TP_SIZE), %g1 - 0x8221'c001, // sub %g7, %g1, %g1 - 0xd05a'2008, // ldx [ %o0 + 8 ], %o0 - 0x81c3'e008, // retl - 0x9000'4008, // add %g1, %o0, %o0 - }; - - assert(this->shdr.sh_size == sizeof(insn)); - memcpy(buf, insn, sizeof(insn)); - - buf[0] |= bits(ctx.tp_addr - ctx.tls_begin, 31, 10); - buf[1] |= bits(ctx.tp_addr - ctx.tls_begin, 9, 0); -} - -} // namespace mold::elf +} // namespace mold diff --git a/elf/arch-x86-64.cc b/src/arch-x86-64.cc similarity index 74% rename from elf/arch-x86-64.cc rename to src/arch-x86-64.cc index af86adec..4e0b5f93 100644 --- a/elf/arch-x86-64.cc +++ b/src/arch-x86-64.cc @@ -28,7 +28,9 @@ #include "mold.h" -namespace mold::elf { +#include + +namespace mold { using E = X86_64; @@ -178,6 +180,50 @@ static u32 relax_gottpoff(u8 *loc) { return 0; } +static u32 relax_tlsdesc_to_ie(u8 *loc) { + switch ((loc[0] << 16) | (loc[1] << 8) | loc[2]) { + case 0x488d05: return 0x488b05; // lea 0(%rip), %rax -> mov 0(%rip), %rax + case 0x488d0d: return 
0x488b0d; // lea 0(%rip), %rcx -> mov 0(%rip), %rcx + case 0x488d15: return 0x488b15; // lea 0(%rip), %rdx -> mov 0(%rip), %rdx + case 0x488d1d: return 0x488b1d; // lea 0(%rip), %rbx -> mov 0(%rip), %rbx + case 0x488d25: return 0x488b25; // lea 0(%rip), %rsp -> mov 0(%rip), %rsp + case 0x488d2d: return 0x488b2d; // lea 0(%rip), %rbp -> mov 0(%rip), %rbp + case 0x488d35: return 0x488b35; // lea 0(%rip), %rsi -> mov 0(%rip), %rsi + case 0x488d3d: return 0x488b3d; // lea 0(%rip), %rdi -> mov 0(%rip), %rdi + case 0x4c8d05: return 0x4c8b05; // lea 0(%rip), %r8 -> mov 0(%rip), %r8 + case 0x4c8d0d: return 0x4c8b0d; // lea 0(%rip), %r9 -> mov 0(%rip), %r9 + case 0x4c8d15: return 0x4c8b15; // lea 0(%rip), %r10 -> mov 0(%rip), %r10 + case 0x4c8d1d: return 0x4c8b1d; // lea 0(%rip), %r11 -> mov 0(%rip), %r11 + case 0x4c8d25: return 0x4c8b25; // lea 0(%rip), %r12 -> mov 0(%rip), %r12 + case 0x4c8d2d: return 0x4c8b2d; // lea 0(%rip), %r13 -> mov 0(%rip), %r13 + case 0x4c8d35: return 0x4c8b35; // lea 0(%rip), %r14 -> mov 0(%rip), %r14 + case 0x4c8d3d: return 0x4c8b3d; // lea 0(%rip), %r15 -> mov 0(%rip), %r15 + } + return 0; +} + +static u32 relax_tlsdesc_to_le(u8 *loc) { + switch ((loc[0] << 16) | (loc[1] << 8) | loc[2]) { + case 0x488d05: return 0x48c7c0; // lea 0(%rip), %rax -> mov $0, %rax + case 0x488d0d: return 0x48c7c1; // lea 0(%rip), %rcx -> mov $0, %rcx + case 0x488d15: return 0x48c7c2; // lea 0(%rip), %rdx -> mov $0, %rdx + case 0x488d1d: return 0x48c7c3; // lea 0(%rip), %rbx -> mov $0, %rbx + case 0x488d25: return 0x48c7c4; // lea 0(%rip), %rsp -> mov $0, %rsp + case 0x488d2d: return 0x48c7c5; // lea 0(%rip), %rbp -> mov $0, %rbp + case 0x488d35: return 0x48c7c6; // lea 0(%rip), %rsi -> mov $0, %rsi + case 0x488d3d: return 0x48c7c7; // lea 0(%rip), %rdi -> mov $0, %rdi + case 0x4c8d05: return 0x49c7c0; // lea 0(%rip), %r8 -> mov $0, %r8 + case 0x4c8d0d: return 0x49c7c1; // lea 0(%rip), %r9 -> mov $0, %r9 + case 0x4c8d15: return 0x49c7c2; // lea 0(%rip), %r10 -> mov 
$0, %r10 + case 0x4c8d1d: return 0x49c7c3; // lea 0(%rip), %r11 -> mov $0, %r11 + case 0x4c8d25: return 0x49c7c4; // lea 0(%rip), %r12 -> mov $0, %r12 + case 0x4c8d2d: return 0x49c7c5; // lea 0(%rip), %r13 -> mov $0, %r13 + case 0x4c8d35: return 0x49c7c6; // lea 0(%rip), %r14 -> mov $0, %r14 + case 0x4c8d3d: return 0x49c7c7; // lea 0(%rip), %r15 -> mov $0, %r15 + } + return 0; +} + // Rewrite a function call to __tls_get_addr to a cheaper instruction // sequence. We can do this when we know the thread-local variable's TP- // relative address at link-time. @@ -255,10 +301,10 @@ static void relax_gd_to_ie(u8 *loc, ElfRel rel, u64 val) { } // Rewrite a function call to __tls_get_addr to a cheaper instruction -// sequence. The difference from relax_gd_to_le is that we are -// materializing a Dynamic Thread Pointer for the current ELF module -// instead of an address for a particular thread-local variable. -static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 tls_size) { +// sequence. The difference from relax_gd_to_le is that we are materializing +// the address of the beginning of TLS block instead of an address of a +// particular thread-local variable. +static void relax_ld_to_le(u8 *loc, ElfRel rel, i64 tls_size) { switch (rel.r_type) { case R_X86_64_PLT32: case R_X86_64_PC32: { @@ -267,10 +313,9 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 tls_size) { // 48 8d 3d 00 00 00 00 lea foo@tlsld(%rip), %rdi // e8 00 00 00 00 call __tls_get_addr // - // The instructions are so short that we cannot rewrite them with - // "mov %fs:0, %rax" which is 9 bytes long. We use a shorter code - // sequence instead. Since "xor %eax, %eax" zero-clears %rax, the - // meaning is equivalent. + // Because the original instruction sequence is so short that we need a + // little bit of code golfing here. "mov %fs:0, %rax" is 9 byte long, so + // xor + mov is shorter. Note that `xor %eax, %eax` zero-clears %eax. 
static const u8 insn[] = { 0x31, 0xc0, // xor %eax, %eax 0x64, 0x48, 0x8b, 0x00, // mov %fs:(%rax), %rax @@ -287,13 +332,12 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 tls_size) { // 48 8d 3d 00 00 00 00 lea foo@tlsld(%rip), %rdi // ff 15 00 00 00 00 call *__tls_get_addr@GOT(%rip) static const u8 insn[] = { - 0x31, 0xc0, // xor %eax, %eax + 0x48, 0x31, 0xc0, // xor %rax, %rax 0x64, 0x48, 0x8b, 0x00, // mov %fs:(%rax), %rax 0x48, 0x2d, 0, 0, 0, 0, // sub $tls_size, %rax - 0x90, // nop }; memcpy(loc - 3, insn, sizeof(insn)); - *(ul32 *)(loc + 5) = tls_size; + *(ul32 *)(loc + 6) = tls_size; break; } case R_X86_64_PLTOFF64: { @@ -324,11 +368,6 @@ template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) @@ -376,7 +415,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write32s(S + A); break; case R_X86_64_64: - apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel); break; case R_X86_64_PC8: check(S + A - P, -(1 << 7), 1 << 7); @@ -492,28 +530,43 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // call *(%rax) // R_X86_64_TLSDESC_CALL foo // - // We may relax the instructions to the following for non-dlopen'd DSO + // We may relax the instructions to the following if its TP-relative + // address is known at link-time // - // mov foo@GOTTPOFF(%rip), %rax + // mov $foo@TPOFF, %rax // nop // - // or to the following for executable. + // or to the following if the TP-relative address is known at + // process startup time. // - // mov $foo@TPOFF, %rax + // mov foo@GOTTPOFF(%rip), %rax // nop + // + // We allow the following alternative code sequence too because + // LLVM emits such code. 
+ // + // lea 0(%rip), %reg + // R_X86_64_GOTPC32_TLSDESC foo + // mov %reg, %rax + // call *(%rax) + // R_X86_64_TLSDESC_CALL foo if (sym.has_tlsdesc(ctx)) { write32s(sym.get_tlsdesc_addr(ctx) + A - P); } else if (sym.has_gottp(ctx)) { - // mov foo@gottpoff(%rip), %rax - loc[-3] = 0x48; - loc[-2] = 0x8b; - loc[-1] = 0x05; + u32 insn = relax_tlsdesc_to_ie(loc - 3); + if (!insn) + Fatal(ctx) << *this << ": illegal instruction sequence for TLSDESC"; + loc[-3] = insn >> 16; + loc[-2] = insn >> 8; + loc[-1] = insn; write32s(sym.get_gottp_addr(ctx) + A - P); } else { - // mov $foo@tpoff, %rax - loc[-3] = 0x48; - loc[-2] = 0xc7; - loc[-1] = 0xc0; + u32 insn = relax_tlsdesc_to_le(loc - 3); + if (!insn) + Fatal(ctx) << *this << ": illegal instruction sequence for TLSDESC"; + loc[-3] = insn >> 16; + loc[-2] = insn >> 8; + loc[-1] = insn; write32s(S - ctx.tp_addr); } break; @@ -650,8 +703,6 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); std::span> rels = get_rels(ctx); // Scan relocations @@ -686,9 +737,6 @@ void InputSection::scan_relocations(Context &ctx) { case R_X86_64_32S: scan_absrel(ctx, sym, rel); break; - case R_X86_64_64: - scan_dyn_absrel(ctx, sym, rel); - break; case R_X86_64_PC8: case R_X86_64_PC16: case R_X86_64_PC32: @@ -711,8 +759,7 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_PLT; break; case R_X86_64_TLSGD: - if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) || - ctx.arg.is_static) { + if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). i++; @@ -726,7 +773,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_X86_64_TLSLD: // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). 
- if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared)) + if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) i++; else ctx.needs_tlsld = true; @@ -746,6 +793,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_X86_64_TPOFF64: check_tlsle(ctx, sym, rel); break; + case R_X86_64_64: case R_X86_64_GOTOFF64: case R_X86_64_DTPOFF32: case R_X86_64_DTPOFF64: @@ -759,4 +807,95 @@ void InputSection::scan_relocations(Context &ctx) { } } -} // namespace mold::elf +// Intel CET is a relatively new CPU feature to enhance security by +// protecting control flow integrity. If the feature is enabled, indirect +// branches (i.e. branch instructions that take a register instead of an +// immediate) must land on a "landing pad" instruction, or a CPU-level fault +// will raise. That prevents an attacker to branch to a middle of a random +// function, making ROP or JOP much harder to conduct. +// +// On x86-64, the landing pad instruction is ENDBR64. That is actually a +// repurposed NOP instruction to provide binary compatibility with older +// hardware that doesn't support CET. +// +// The problem here is that the compiler always emits a landing pad at the +// beginning fo a global function because it doesn't know whether or not the +// function's address is taken in other translation units. As a result, the +// resulting binary contains more landing pads than necessary. +// +// This function rewrites a landing pad with a nop if the function's address +// was not actually taken. We can do what the compiler cannot because we +// know about all translation units. +void rewrite_endbr(Context &ctx) { + Timer t(ctx, "rewrite_endbr"); + + constexpr u8 endbr64[] = {0xf3, 0x0f, 0x1e, 0xfa}; + constexpr u8 nop[] = {0x0f, 0x1f, 0x40, 0x00}; + + // Rewrite all endbr64 instructions referred to by function symbols with + // NOPs. 
We handle only global symbols because the compiler doesn't emit + // an endbr64 for a file-scoped function in the first place if its address + // is not taken within the file. + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + for (Symbol *sym : file->get_global_syms()) { + if (sym->file == file && sym->esym().st_type == STT_FUNC) { + if (InputSection *isec = sym->get_input_section(); + isec && (isec->shdr().sh_flags & SHF_EXECINSTR)) { + if (OutputSection *osec = isec->output_section) { + u8 *buf = ctx.buf + osec->shdr.sh_offset + isec->offset + sym->value; + if (memcmp(buf, endbr64, 4) == 0) + memcpy(buf, nop, 4); + } + } + } + } + }); + + auto write_back = [&](InputSection *isec, i64 offset) { + // If isec has an endbr64 at a given offset, copy that instruction to + // the output buffer, possibly overwriting a nop written in the above + // loop. + if (isec && isec->output_section && + (isec->shdr().sh_flags & SHF_EXECINSTR) && + 0 <= offset && offset <= isec->contents.size() - 4 && + memcmp(isec->contents.data() + offset, endbr64, 4) == 0) + memcpy(ctx.buf + isec->output_section->shdr.sh_offset + isec->offset + offset, + endbr64, 4); + }; + + // Write back endbr64 instructions if they are referred to by address-taking + // relocations. + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + for (std::unique_ptr> &isec : file->sections) { + if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC)) { + for (const ElfRel &rel : isec->get_rels(ctx)) { + if (!is_func_call_rel(rel)) { + Symbol *sym = file->symbols[rel.r_sym]; + if (sym->esym().st_type == STT_SECTION) + write_back(sym->get_input_section(), rel.r_addend); + else + write_back(sym->get_input_section(), sym->value); + } + } + } + } + }); + + // We record addresses of some symbols in the ELF header, .dynamic or in + // .dynsym. We need to retain endbr64s for such symbols. 
+ auto keep = [&](Symbol *sym) { + if (sym) + write_back(sym->get_input_section(), sym->value); + }; + + keep(ctx.arg.entry); + keep(ctx.arg.init); + keep(ctx.arg.fini); + + if (ctx.dynsym) + for (Symbol *sym : ctx.dynsym->symbols) + if (sym && sym->is_exported) + keep(sym); +} + +} // namespace mold diff --git a/elf/cmdline.cc b/src/cmdline.cc similarity index 77% rename from elf/cmdline.cc rename to src/cmdline.cc index 0e587b54..bdb79ed9 100644 --- a/elf/cmdline.cc +++ b/src/cmdline.cc @@ -1,21 +1,21 @@ #include "mold.h" -#include "../common/cmdline.h" #include #include #include #include #include +#include #include -#ifdef _WIN32 -# define _isatty isatty -# define STDERR_FILENO (_fileno(stderr)) -#else +#if __has_include() # include +#else +# define isatty _isatty +# define STDERR_FILENO (_fileno(stderr)) #endif -namespace mold::elf { +namespace mold { inline const char helpmsg[] = R"( Options: @@ -34,7 +34,7 @@ inline const char helpmsg[] = R"( --no-dynamic-linker -L DIR, --library-path DIR Add DIR to library search path -M, --print-map Write map file to stdout - -N, --omagic Do not page align data, do not make text readonly + -N, --omagic Do not page align data; do not make text readonly --no-omagic -O NUMBER Ignored -S, --strip-debug Strip .debug_* sections @@ -44,7 +44,8 @@ inline const char helpmsg[] = R"( -f SHLIB, --auxiliary SHLIB Set DT_AUXILIARY to the specified value -h LIBNAME, --soname LIBNAME Set shared library name - -l LIBNAME Search for a given library + -l LIBNAME, --library LIBNAME + Search for a given library -m TARGET Set target -o FILE, --output FILE Set output filename -q, --emit-relocs Leaves relocation sections in the output @@ -52,24 +53,29 @@ inline const char helpmsg[] = R"( -s, --strip-all Strip .symtab section -u SYMBOL, --undefined SYMBOL Force to resolve SYMBOL + -y SYMBOL, --trace-symbol SYMBOL + Trace references to SYMBOL --Bdynamic, --dy Link against shared libraries (default) --Bstatic, --dn, --static Do not link against 
shared libraries - --Bsymbolic Bind global symbols locally - --Bsymbolic-functions Bind global functions locally - --Bno-symbolic Cancel --Bsymbolic and --Bsymbolic-functions + --Bsymbolic Bind all symbols locally + --Bsymbolic-functions Bind function symbols locally + --Bsymbolic-non-weak Bind all but weak symbols locally + --Bsymbolic-non-weak-functions + Bind all but weak function symbols locally + --Bno-symbolic Cancel --Bsymbolic options --Map FILE Write map file to a given file --Tbss=ADDR Set address to .bss - --Tdata Set address to .data - --Ttext Set address to .text + --Tdata=ADDR Set address to .data + --Ttext=ADDR Set address to .text --allow-multiple-definition Allow multiple definitions - --apply-dynamic-relocs Apply link-time values for dynamic relocations (defualt) + --apply-dynamic-relocs Apply link-time values for dynamic relocations (default) --no-apply-dynamic-relocs --as-needed Only set DT_NEEDED if used --no-as-needed - --build-id [none,md5,sha1,sha256,uuid,HEXSTRING] + --build-id [none,md5,sha1,sha256,fast,uuid,HEXSTRING] Generate build ID --no-build-id - --chroot DIR Set a given path to root directory + --chroot DIR Set a given path to the root directory --color-diagnostics=[auto,always,never] Use colors in diagnostics --color-diagnostics Alias for --color-diagnostics=always @@ -80,16 +86,19 @@ inline const char helpmsg[] = R"( --defsym=SYMBOL=VALUE Define a symbol alias --demangle Demangle C++ symbols in log messages (default) --no-demangle + --detach Create separate debug info file in the background (default) + --no-detach --enable-new-dtags Emit DT_RUNPATH for --rpath (default) --disable-new-dtags Emit DT_RPATH for --rpath --execute-only Make executable segments unreadable --dp Ignored - --dynamic-list Read a list of dynamic symbols (implies -Bsymbolic) + --dynamic-list=FILE Read a list of dynamic symbols (implies -Bsymbolic) + --dynamic-list-data Add data symbols to dynamic symbols --eh-frame-hdr Create .eh_frame_hdr section 
--no-eh-frame-hdr - --exclude-libs LIB,LIB,.. Mark all symbols in given libraries hidden + --exclude-libs LIB,LIB,.. Mark all symbols in given libraries as hidden --export-dynamic-symbol Put symbols matching glob in the dynamic symbol table - --export-dynamic-symbol-list + --export-dynamic-symbol-list=FILE Read a list of dynamic symbols --fatal-warnings Treat warnings as errors --no-fatal-warnings Do not treat warnings as errors (default) @@ -107,54 +116,61 @@ inline const char helpmsg[] = R"( Allow merging non-executable sections with --icf --image-base ADDR Set the base address to a given value --init SYMBOL Call SYMBOL at load-time + --nmagic Do not page align sections + --no-nmagic --no-undefined Report undefined symbols (even with --shared) --noinhibit-exec Create an output file even if errors occur - --oformat=binary Omit ELF, section and program headers + --oformat=binary Omit ELF, section, and program headers --pack-dyn-relocs=[relr,none] Pack dynamic relocations --package-metadata=STRING Set a given string to .note.package --perf Print performance statistics - --pie, --pic-executable Create a position independent executable + --pie, --pic-executable Create a position-independent executable --no-pie, --no-pic-executable - --pop-state Restore state of flags governing input file handling + --pop-state Restore the state of flags governing input file handling --print-gc-sections Print removed unreferenced sections --no-print-gc-sections --print-icf-sections Print folded identical sections --no-print-icf-sections - --push-state Save state of flags governing input file handling + --push-state Save the state of flags governing input file handling --quick-exit Use quick_exit to exit (default) --no-quick-exit --relax Optimize instructions (default) --no-relax - --repro Embed input files to .repro section + --repro Embed input files in .repro section --require-defined SYMBOL Require SYMBOL be defined in the final output --retain-symbols-file FILE Keep only symbols 
listed in FILE - --reverse-sections Reverses input sections in the output file + --reverse-sections Reverse input sections in the output file --rosegment Put read-only non-executable sections in their own segment (default) --no-rosegment Put read-only non-executable sections in an executable segment - --rpath DIR Add DIR to runtime search path + --rpath DIR Add DIR to the runtime search path --rpath-link DIR Ignored --run COMMAND ARG... Run COMMAND with mold as /usr/bin/ld - --section-start=SECTION=ADDR Set address to section - --shared, --Bshareable Create a share library + --section-start=SECTION=ADDR Set address for section + --separate-debug-file[=FILE] Separate debug info to the specified file + --no-separate-debug-file + --shared, --Bshareable Create a shared library --shuffle-sections[=SEED] Randomize the output by shuffling input sections --sort-common Ignored --sort-section Ignored - --spare-dynamic-tags NUMBER Reserve give number of tags in .dynamic section + --spare-dynamic-tags NUMBER Reserve the given number of tags in the .dynamic section + --spare-program-headers NUMBER + Reserve the given number of slots in the program header --start-lib Give following object files in-archive-file semantics --end-lib End the effect of --start-lib --stats Print input statistics - --sysroot DIR Set target system root directory + --sysroot DIR Set the target system root directory --thread-count COUNT, --threads=COUNT Use COUNT number of threads --threads Use multiple threads (default) --no-threads - --trace Print name of each input file - --undefined-version Do not report version scripts that refer undefined symbols - --no-undefined-version Report version scripts that refer undefined symbols (default) + --trace Print the name of each input file + --undefined-glob PATTERN Force to resolve all symbols that match a given pattern + --undefined-version Do not report version scripts that refer to undefined symbols + --no-undefined-version Report version scripts that refer to 
undefined symbols (default) --unique PATTERN Don't merge input sections that match a given pattern --unresolved-symbols [report-all,ignore-all,ignore-in-object-files,ignore-in-shared-libs] - How to handle unresolved symbols + Handle unresolved symbols --version-script FILE Read version script --warn-common Warn about common symbols --no-warn-common @@ -166,21 +182,21 @@ inline const char helpmsg[] = R"( Report unresolved symbols as errors (default) --whole-archive Include all objects from static archives --no-whole-archive - --wrap SYMBOL Use wrapper function for a given symbol + --wrap SYMBOL Use a wrapper function for a given symbol -z defs Report undefined symbols (even with --shared) -z nodefs -z common-page-size=VALUE Ignored - -z execstack Require executable stack + -z execstack Require an executable stack -z noexecstack - -z execstack-if-needed Make the stack area execuable if an input file explicitly requests it + -z execstack-if-needed Make the stack area executable if an input file explicitly requests it -z initfirst Mark DSO to be initialized first at runtime - -z interpose Mark object to interpose all DSOs but executable + -z interpose Mark object to interpose all DSOs but the executable -z keep-text-section-prefix Keep .text.{hot,unknown,unlikely,startup,exit} as separate sections in the final binary -z nokeep-text-section-prefix -z lazy Enable lazy function resolution (default) -z max-page-size=VALUE Use VALUE as the memory page size -z nocopyreloc Do not create copy relocations - -z nodefaultlib Make the dynamic loader to ignore default search paths + -z nodefaultlib Make the dynamic loader ignore default search paths -z nodelete Mark DSO non-deletable at runtime -z nodlopen Mark DSO not available to dlopen -z nodump Mark DSO not available to dldump @@ -190,19 +206,128 @@ inline const char helpmsg[] = R"( -z nopack-relative-relocs -z sectionheader Do not omit section header (default) -z nosectionheader Omit section header + -z 
start_stop_visibility=[hidden,protected] + Specify symbol visibility for "__start_SECNAME" and "__stop_SECNAME" symbols -z separate-loadable-segments - Separate all loadable segments to different pages - -z separate-code Separate code and data into different pages + Separate all loadable segments onto different pages + -z separate-code Separate code and data onto different pages -z noseparate-code Allow overlap in pages - -z stack-size=VALUE Set size of stack segment + -z stack-size=VALUE Set the size of the stack segment -z relro Make some sections read-only after relocation (default) -z norelro + -z rewrite-endbr Rewrite indirect branch target instructions with NOPs + -z norewrite-endbr + -z rodynamic Make the .dynamic section read-only -z text Report error if DT_TEXTREL is set -z notext -z textoff -mold: supported targets: elf32-i386 elf64-x86-64 elf32-littlearm elf64-littleaarch64 elf32-littleriscv elf32-bigriscv elf64-littleriscv elf64-bigriscv elf32-powerpc elf64-powerpc elf64-powerpc elf64-powerpcle elf64-s390 elf64-sparc elf32-m68k elf32-sh-linux elf64-alpha elf64-loongarch elf32-loongarch -mold: supported emulations: elf_i386 elf_x86_64 armelf_linux_eabi aarch64linux aarch64elf elf32lriscv elf32briscv elf64lriscv elf64briscv elf32ppc elf32ppclinux elf64ppc elf64lppc elf64_s390 elf64_sparc m68kelf shlelf_linux elf64alpha elf64loongarch elf32loongarch)"; +mold: supported targets: elf32-i386 elf64-x86-64 elf32-littlearm elf64-littleaarch64 elf32-littleriscv elf32-bigriscv elf64-littleriscv elf64-bigriscv elf32-powerpc elf64-powerpc elf64-powerpc elf64-powerpcle elf64-s390 elf64-sparc elf32-m68k elf32-sh-linux elf64-loongarch elf32-loongarch +mold: supported emulations: elf_i386 elf_x86_64 armelf_linux_eabi aarch64linux aarch64elf elf32lriscv elf32briscv elf64lriscv elf64briscv elf32ppc elf32ppclinux elf64ppc elf64lppc elf64_s390 elf64_sparc m68kelf shlelf_linux elf64loongarch elf32loongarch)"; + +template +static std::vector +read_response_file(Context &ctx, 
std::string_view path, i64 depth) { + if (depth > 10) + Fatal(ctx) << path << ": response file nesting too deep"; + + std::vector vec; + MappedFile *mf = must_open_file(ctx, std::string(path)); + std::string_view data((char *)mf->data, mf->size); + + mf->is_dependency = false; + + while (!data.empty()) { + if (isspace(data[0])) { + data = data.substr(1); + continue; + } + + auto read_quoted = [&]() { + char quote = data[0]; + data = data.substr(1); + + std::string buf; + while (!data.empty() && data[0] != quote) { + if (data[0] == '\\' && data.size() >= 1) { + buf.append(1, data[1]); + data = data.substr(2); + } else { + buf.append(1, data[0]); + data = data.substr(1); + } + } + if (data.empty()) + Fatal(ctx) << path << ": premature end of input"; + data = data.substr(1); + return save_string(ctx, buf); + }; + + auto read_unquoted = [&] { + std::string buf; + while (!data.empty()) { + if (data[0] == '\\' && data.size() >= 1) { + buf.append(1, data[1]); + data = data.substr(2); + continue; + } + + if (!isspace(data[0])) { + buf.append(1, data[0]); + data = data.substr(1); + continue; + } + break; + } + return save_string(ctx, buf); + }; + + std::string_view tok; + if (data[0] == '\'' || data[0] == '\"') + tok = read_quoted(); + else + tok = read_unquoted(); + + if (tok.starts_with('@')) + append(vec, read_response_file(ctx, tok.substr(1), depth + 1)); + else + vec.push_back(tok); + } + return vec; +} + +// Replace "@path/to/some/text/file" with its file contents. +template +std::vector +expand_response_files(Context &ctx, char **argv) { + std::vector vec; + for (i64 i = 0; argv[i]; i++) { + if (argv[i][0] == '@') + append(vec, read_response_file(ctx, argv[i] + 1, 1)); + else + vec.push_back(argv[i]); + } + return vec; +} + +static i64 get_default_thread_count() { + // mold doesn't scale well above 32 threads. 
+ int n = tbb::global_control::active_value( + tbb::global_control::max_allowed_parallelism); + return std::min(n, 32); +} + +static inline std::string_view string_trim(std::string_view str) { + size_t pos = str.find_first_not_of(" \t"); + if (pos == str.npos) + return ""; + str = str.substr(pos); + + pos = str.find_last_not_of(" \t"); + if (pos == str.npos) + return str; + return str.substr(0, pos + 1); +} static std::vector add_dashes(std::string name) { // Single-letter option @@ -248,6 +373,15 @@ static i64 parse_number(Context &ctx, std::string opt, return ret; } +static char from_hex(char c) { + if ('0' <= c && c <= '9') + return c - '0'; + if ('a' <= c && c <= 'f') + return c - 'a' + 10; + assert('A' <= c && c <= 'F'); + return c - 'A' + 10; +} + template static std::vector parse_hex_build_id(Context &ctx, std::string_view arg) { auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; @@ -256,23 +390,34 @@ static std::vector parse_hex_build_id(Context &ctx, std::string_view arg) if (!std::regex_match(arg.begin(), arg.end(), re)) Fatal(ctx) << "invalid build-id: " << arg; - arg = arg.substr(2); - - auto fn = [](char c) { - if ('0' <= c && c <= '9') - return c - '0'; - if ('a' <= c && c <= 'f') - return c - 'a' + 10; - assert('A' <= c && c <= 'F'); - return c - 'A' + 10; - }; - std::vector vec; - for (i64 i = 0; i < arg.size(); i += 2) - vec.push_back((fn(arg[i]) << 4) | fn(arg[i + 1])); + for (i64 i = 2; i < arg.size(); i += 2) + vec.push_back((from_hex(arg[i]) << 4) | from_hex(arg[i + 1])); return vec; } +template +static std::string +parse_encoded_package_metadata(Context &ctx, std::string_view arg) { + auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; + static std::regex re(R"(([^%]|%[0-9a-fA-F][0-9a-fA-F])*)", flags); + + if (!std::regex_match(arg.begin(), arg.end(), re)) + Fatal(ctx) << "--encoded-package-metadata: invalid string: " << arg; + + std::ostringstream out; + while (!arg.empty()) { + if 
(arg[0] == '%') { + out << (char)((from_hex(arg[1]) << 4) | from_hex(arg[2])); + arg = arg.substr(3); + } else { + out << arg[0]; + arg = arg.substr(1); + } + } + return out.str(); +} + static std::vector split_by_comma_or_colon(std::string_view str) { std::vector vec; @@ -284,18 +429,16 @@ split_by_comma_or_colon(std::string_view str) { break; } vec.push_back(str.substr(0, pos)); - str = str.substr(pos); + str = str.substr(pos + 1); } return vec; } template static void read_retain_symbols_file(Context &ctx, std::string_view path) { - MappedFile> *mf = - MappedFile>::must_open(ctx, std::string(path)); + MappedFile *mf = must_open_file(ctx, std::string(path)); std::string_view data((char *)mf->data, mf->size); - - ctx.arg.retain_symbols_file.reset(new std::unordered_set); + std::vector *> vec; while (!data.empty()) { size_t pos = data.find('\n'); @@ -311,8 +454,10 @@ static void read_retain_symbols_file(Context &ctx, std::string_view path) { name = string_trim(name); if (!name.empty()) - ctx.arg.retain_symbols_file->insert(name); + vec.push_back(get_symbol(ctx, name)); } + + ctx.arg.retain_symbols_file = std::move(vec); } static bool is_file(std::string_view path) { @@ -406,10 +551,14 @@ std::vector parse_nonpositional_args(Context &ctx) { bool version_shown = false; bool warn_shared_textrel = false; + bool error_unresolved_symbols = true; std::optional z_separate_code; + std::optional report_undefined; std::optional z_relro; + std::optional separate_debug_file; std::optional shuffle_sections_seed; std::unordered_set rpaths; + std::vector version_scripts; auto add_rpath = [&](std::string_view arg) { if (rpaths.insert(arg).second) { @@ -419,19 +568,27 @@ std::vector parse_nonpositional_args(Context &ctx) { } }; - // RISC-V object files contains lots of local symbols, so by default - // we discard them. This is compatible with GNU ld. - if constexpr (is_riscv) + // RISC-V and LoongArch object files contains lots of local symbols, + // so by default we discard them. 
This is compatible with GNU ld. + if constexpr (is_riscv || is_loongarch) ctx.arg.discard_locals = true; - // It looks like the SPARC's dynamic linker takes both RELA's r_addend - // and the value at the relocated place. So we don't want to write - // values to relocated places. - if constexpr (is_sparc) - ctx.arg.apply_dynamic_relocs = false; + // We generally don't need to write addends to relocated places if the + // relocation type is RELA because RELA records contain addends. + // However, there are too much code that wrongly assumes that addends + // are written to both RELA records and relocated places, so we write + // addends to relocated places by default. There are a few exceptions: + // + // - It looks like the SPARC's dynamic linker takes both RELA's r_addend + // and the value at the relocated place. So we don't want to write + // values to relocated places. + // + // - Static PIE binaries crash on startup in some RISC-V environment if + // we write addends to relocated places. 
+ ctx.arg.apply_dynamic_relocs = !is_sparc && !is_riscv; auto read_arg = [&](std::string name) { - for (std::string opt : add_dashes(name)) { + for (const std::string &opt : add_dashes(name)) { if (args[0] == opt) { if (args.size() == 1) Fatal(ctx) << "option -" << name << ": argument missing"; @@ -451,7 +608,7 @@ std::vector parse_nonpositional_args(Context &ctx) { }; auto read_eq = [&](std::string name) { - for (std::string opt : add_dashes(name)) { + for (const std::string &opt : add_dashes(name)) { if (args[0].starts_with(opt + "=")) { arg = args[0].substr(opt.size() + 1); args = args.subspan(1); @@ -462,7 +619,7 @@ std::vector parse_nonpositional_args(Context &ctx) { }; auto read_flag = [&](std::string name) { - for (std::string opt : add_dashes(name)) { + for (const std::string &opt : add_dashes(name)) { if (args[0] == opt) { args = args.subspan(1); return true; @@ -501,8 +658,8 @@ std::vector parse_nonpositional_args(Context &ctx) { while (!args.empty()) { if (read_flag("help")) { - SyncOut(ctx) << "Usage: " << ctx.cmdline_args[0] - << " [options] file...\n" << helpmsg; + Out(ctx) << "Usage: " << ctx.cmdline_args[0] + << " [options] file...\n" << helpmsg; exit(0); } @@ -513,19 +670,19 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_flag("no-dynamic-linker")) { ctx.arg.dynamic_linker = ""; } else if (read_flag("v")) { - SyncOut(ctx) << mold_version; + Out(ctx) << get_mold_version(); version_shown = true; } else if (read_flag("version")) { - SyncOut(ctx) << mold_version; + Out(ctx) << get_mold_version(); exit(0); } else if (read_flag("V")) { - SyncOut(ctx) << mold_version - << "\n Supported emulations:\n elf_x86_64\n elf_i386\n" - << " aarch64linux\n armelf_linux_eabi\n elf64lriscv\n" - << " elf64briscv\n elf32lriscv\n elf32briscv\n" - << " elf32ppc\n elf64ppc\n elf64lppc\n elf64_s390\n" - << " elf64_sparc\n m68kelf\n shlelf_linux\n" - << " elf64alpha\n elf64loongarch\n elf32loongarch"; + Out(ctx) << get_mold_version() + << "\n 
Supported emulations:\n elf_x86_64\n elf_i386\n" + << " aarch64linux\n armelf_linux_eabi\n elf64lriscv\n" + << " elf64briscv\n elf32lriscv\n elf32briscv\n" + << " elf32ppc\n elf64ppc\n elf64lppc\n elf64_s390\n" + << " elf64_sparc\n m68kelf\n shlelf_linux\n" + << " elf64loongarch\n elf32loongarch"; version_shown = true; } else if (read_arg("m")) { if (arg == "elf_x86_64") { @@ -558,8 +715,6 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.emulation = M68K::target_name; } else if (arg == "shlelf_linux") { ctx.arg.emulation = SH4::target_name; - } else if (arg == "elf64alpha") { - ctx.arg.emulation = ALPHA::target_name; } else if (arg == "elf64loongarch") { ctx.arg.emulation = LOONGARCH64::target_name; } else if (arg == "elf32loongarch") { @@ -574,19 +729,22 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_flag("no-export-dynamic")) { ctx.arg.export_dynamic = false; } else if (read_flag("Bsymbolic")) { - ctx.arg.Bsymbolic = true; + ctx.arg.Bsymbolic = BSYMBOLIC_ALL; } else if (read_flag("Bsymbolic-functions")) { - ctx.arg.Bsymbolic_functions = true; + ctx.arg.Bsymbolic = BSYMBOLIC_FUNCTIONS; + } else if (read_flag("Bsymbolic-non-weak")) { + ctx.arg.Bsymbolic = BSYMBOLIC_NON_WEAK; + } else if (read_flag("Bsymbolic-non-weak-functions")) { + ctx.arg.Bsymbolic = BSYMBOLIC_NON_WEAK_FUNCTIONS; } else if (read_flag("Bno-symbolic")) { - ctx.arg.Bsymbolic = false; - ctx.arg.Bsymbolic_functions = false; + ctx.arg.Bsymbolic = BSYMBOLIC_NONE; } else if (read_arg("exclude-libs")) { append(ctx.arg.exclude_libs, split_by_comma_or_colon(arg)); } else if (read_flag("q") || read_flag("emit-relocs")) { ctx.arg.emit_relocs = true; ctx.arg.discard_locals = false; } else if (read_arg("e") || read_arg("entry")) { - ctx.arg.entry = arg; + ctx.arg.entry = get_symbol(ctx, arg); } else if (read_arg("Map")) { ctx.arg.Map = arg; ctx.arg.print_map = true; @@ -595,15 +753,18 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if 
(read_flag("print-map") || read_flag("M")) { ctx.arg.print_map = true; } else if (read_flag("Bstatic") || read_flag("dn") || read_flag("static")) { - ctx.arg.is_static = true; + ctx.arg.static_ = true; remaining.push_back("--Bstatic"); } else if (read_flag("Bdynamic") || read_flag("dy")) { - ctx.arg.is_static = false; + ctx.arg.static_ = false; remaining.push_back("--Bdynamic"); } else if (read_flag("shared") || read_flag("Bshareable")) { ctx.arg.shared = true; } else if (read_arg("spare-dynamic-tags")) { ctx.arg.spare_dynamic_tags = parse_number(ctx, "spare-dynamic-tags", arg); + } else if (read_arg("spare-program-headers")) { + ctx.arg.spare_program_headers + = parse_number(ctx, "spare-program-headers", arg); } else if (read_flag("start-lib")) { remaining.push_back("--start-lib"); } else if (read_flag("start-stop")) { @@ -624,6 +785,10 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.demangle = true; } else if (read_flag("no-demangle")) { ctx.arg.demangle = false; + } else if (read_flag("detach")) { + ctx.arg.detach = true; + } else if (read_flag("no-detach")) { + ctx.arg.detach = false; } else if (read_flag("default-symver")) { ctx.arg.default_symver = true; } else if (read_flag("noinhibit-exec")) { @@ -654,19 +819,22 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.unique = std::move(*pat); } else if (read_arg("unresolved-symbols")) { if (arg == "report-all" || arg == "ignore-in-shared-libs") - ctx.arg.unresolved_symbols = UNRESOLVED_ERROR; + report_undefined = true; else if (arg == "ignore-all" || arg == "ignore-in-object-files") - ctx.arg.unresolved_symbols = UNRESOLVED_IGNORE; + report_undefined = false; else Fatal(ctx) << "unknown --unresolved-symbols argument: " << arg; } else if (read_arg("undefined") || read_arg("u")) { ctx.arg.undefined.push_back(get_symbol(ctx, arg)); + } else if (read_arg("undefined-glob")) { + if (!ctx.arg.undefined_glob.add(arg, 0)) + Fatal(ctx) << "--undefined-glob: invalid pattern: " << arg; } else if 
(read_arg("require-defined")) { ctx.arg.require_defined.push_back(get_symbol(ctx, arg)); } else if (read_arg("init")) { - ctx.arg.init = arg; + ctx.arg.init = get_symbol(ctx, arg); } else if (read_arg("fini")) { - ctx.arg.fini = arg; + ctx.arg.fini = get_symbol(ctx, arg); } else if (read_arg("hash-style")) { if (arg == "sysv") { ctx.arg.hash_style_sysv = true; @@ -727,6 +895,8 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_flag("pack-dyn-relocs=none") || read_z_flag("nopack-relative-relocs")) { ctx.arg.pack_dyn_relocs_relr = false; + } else if (read_arg("encoded-package-metadata")) { + ctx.arg.package_metadata = parse_encoded_package_metadata(ctx, arg); } else if (read_arg("package-metadata")) { ctx.arg.package_metadata = arg; } else if (read_flag("stats")) { @@ -772,7 +942,7 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.wrap.insert(arg); } else if (read_flag("omagic") || read_flag("N")) { ctx.arg.omagic = true; - ctx.arg.is_static = true; + ctx.arg.static_ = true; } else if (read_flag("no-omagic")) { ctx.arg.omagic = false; } else if (read_arg("oformat")) { @@ -823,19 +993,20 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.page_size = parse_number(ctx, "-z max-page-size", arg); if (!has_single_bit(ctx.page_size)) Fatal(ctx) << "-z max-page-size " << arg << ": value must be a power of 2"; - } else if (read_z_arg("start-stop-visibility")) { - if (arg != "hidden") - Fatal(ctx) << "-z start-stop-visibility: unsupported visibility: " << arg; + } else if (read_z_flag("start-stop-visibility=protected")) { + ctx.arg.z_start_stop_visibility_protected = true; + } else if (read_z_flag("start-stop-visibility=hidden")) { + ctx.arg.z_start_stop_visibility_protected = false; } else if (read_z_flag("noexecstack")) { ctx.arg.z_execstack = false; } else if (read_z_flag("relro")) { z_relro = true; } else if (read_z_flag("norelro")) { z_relro = false; - } else if (read_z_flag("defs")) { - ctx.arg.z_defs = true; + } else if 
(read_z_flag("defs") || read_flag("no-undefined")) { + report_undefined = true; } else if (read_z_flag("undefs")) { - ctx.arg.z_defs = false; + report_undefined = false; } else if (read_z_flag("nodlopen")) { ctx.arg.z_dlopen = false; } else if (read_z_flag("nodelete")) { @@ -867,6 +1038,12 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.z_origin = true; } else if (read_z_flag("nodefaultlib")) { ctx.arg.z_nodefaultlib = true; + } else if (read_eq("separate-debug-file")) { + separate_debug_file = arg; + } else if (read_flag("separate-debug-file")) { + separate_debug_file = ""; + } else if (read_flag("no-separate-debug-file")) { + separate_debug_file.reset(); } else if (read_z_flag("separate-loadable-segments")) { z_separate_code = SEPARATE_LOADABLE_SEGMENTS; } else if (read_z_flag("separate-code")) { @@ -883,10 +1060,24 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.z_sectionheader = true; } else if (read_z_flag("nosectionheader")) { ctx.arg.z_sectionheader = false; + } else if (read_z_flag("rodynamic")) { + ctx.arg.z_rodynamic = true; + } else if (read_z_flag("x86-64-v2")) { + ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V2; + } else if (read_z_flag("x86-64-v3")) { + ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V3; + } else if (read_z_flag("x86-64-v4")) { + ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V4; } else if (read_z_flag("rewrite-endbr")) { + if constexpr (!is_x86_64) + Fatal(ctx) << "-z rewrite-endbr is supported only on x86-64"; ctx.arg.z_rewrite_endbr = true; - } else if (read_flag("no-undefined")) { - ctx.arg.z_defs = true; + } else if (read_z_flag("norewrite-endbr")) { + ctx.arg.z_rewrite_endbr = false; + } else if (read_flag("nmagic")) { + ctx.arg.nmagic = true; + } else if (read_flag("no-nmagic")) { + ctx.arg.nmagic = false; } else if (read_flag("fatal-warnings")) { ctx.arg.fatal_warnings = true; } else if (read_flag("no-fatal-warnings")) { @@ -1012,9 +1203,9 @@ std::vector 
parse_nonpositional_args(Context &ctx) { } else if (read_flag("strip-debug") || read_flag("S")) { ctx.arg.strip_debug = true; } else if (read_flag("warn-unresolved-symbols")) { - ctx.arg.unresolved_symbols = UNRESOLVED_WARN; + error_unresolved_symbols = false; } else if (read_flag("error-unresolved-symbols")) { - ctx.arg.unresolved_symbols = UNRESOLVED_ERROR; + error_unresolved_symbols = true; } else if (read_arg("rpath")) { add_rpath(arg); } else if (read_arg("R")) { @@ -1040,7 +1231,7 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (arg == "sha1") { ctx.arg.build_id.kind = BuildId::HASH; ctx.arg.build_id.hash_size = 20; - } else if (arg == "sha256") { + } else if (arg == "sha256" || arg == "fast") { ctx.arg.build_id.kind = BuildId::HASH; ctx.arg.build_id.hash_size = 32; } else if (arg.starts_with("0x") || arg.starts_with("0X")) { @@ -1062,6 +1253,10 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.auxiliary.push_back(arg); } else if (read_arg("filter") || read_arg("F")) { ctx.arg.filter.push_back(arg); + } else if (read_flag("allow-shlib-undefined")) { + ctx.arg.allow_shlib_undefined = true; + } else if (read_flag("no-allow-shlib-undefined")) { + ctx.arg.allow_shlib_undefined = false; } else if (read_arg("O")) { } else if (read_flag("EB")) { } else if (read_flag("EL")) { @@ -1079,8 +1274,6 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_flag("enable-new-dtags")) { } else if (read_flag("disable-new-dtags")) { } else if (read_flag("nostdlib")) { - } else if (read_flag("allow-shlib-undefined")) { - } else if (read_flag("no-allow-shlib-undefined")) { } else if (read_flag("no-add-needed")) { } else if (read_flag("no-call-graph-profile-sort")) { } else if (read_flag("no-copy-dt-needed-entries")) { @@ -1096,6 +1289,7 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_flag("warn-constructors")) { } else if (read_flag("warn-execstack")) { } else if (read_flag("no-warn-execstack")) { + } else if 
(read_flag("long-plt")) { } else if (read_flag("secure-plt")) { } else if (read_arg("rpath-link")) { } else if (read_z_flag("combreloc")) { @@ -1104,17 +1298,12 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_flag("no-keep-memory")) { } else if (read_arg("max-cache-size")) { } else if (read_arg("version-script")) { - // --version-script is treated as positional arguments even though - // they are actually not positional. This is because linker scripts - // (a positional argument) can also specify a version script, and - // it's better to consolidate parsing in read_input_files. In - // particular, version scripts can modify ctx.default_version which - // we initialize *after* parsing non-positional args, so the parsing - // cannot be done right here. - remaining.push_back("--version-script=" + std::string(arg)); + version_scripts.push_back(arg); } else if (read_arg("dynamic-list")) { - ctx.arg.Bsymbolic = true; + ctx.arg.Bsymbolic = BSYMBOLIC_ALL; append(ctx.dynamic_list_patterns, parse_dynamic_list(ctx, arg)); + } else if (read_arg("dynamic-list-data")) { + ctx.arg.dynamic_list_data = true; } else if (read_arg("export-dynamic-symbol")) { ctx.dynamic_list_patterns.push_back({arg, ""}); } else if (read_arg("export-dynamic-symbol-list")) { @@ -1127,7 +1316,7 @@ std::vector parse_nonpositional_args(Context &ctx) { remaining.push_back("--whole-archive"); } else if (read_flag("no-whole-archive")) { remaining.push_back("--no-whole-archive"); - } else if (read_arg("l")) { + } else if (read_arg("l") || read_arg("library")) { remaining.push_back("-l" + std::string(arg)); } else if (read_arg("script") || read_arg("T")) { remaining.push_back(std::string(arg)); @@ -1145,7 +1334,7 @@ std::vector parse_nonpositional_args(Context &ctx) { Fatal(ctx) << "unknown command line option: -dynamic; -dynamic is a " << "macOS linker's option. 
mold does not support macOS."; } else { - if (args[0][0] == '-') + if (args[0].starts_with('-')) Fatal(ctx) << "unknown command line option: " << args[0]; remaining.push_back(std::string(args[0])); args = args.subspan(1); @@ -1172,13 +1361,25 @@ std::vector parse_nonpositional_args(Context &ctx) { if (ctx.arg.pic) ctx.arg.image_base = 0; + if (!report_undefined) + report_undefined = !ctx.arg.shared; + + if (*report_undefined) { + if (error_unresolved_symbols) + ctx.arg.unresolved_symbols = UNRESOLVED_ERROR; + else + ctx.arg.unresolved_symbols = UNRESOLVED_WARN; + } else { + ctx.arg.unresolved_symbols = UNRESOLVED_IGNORE; + } + if (ctx.arg.retain_symbols_file) { ctx.arg.strip_all = false; ctx.arg.discard_all = false; } if (ctx.arg.relocatable) - ctx.arg.is_static = true; + ctx.arg.static_ = true; if (ctx.arg.shuffle_sections == SHUFFLE_SECTIONS_SHUFFLE) { if (shuffle_sections_seed) @@ -1200,6 +1401,9 @@ std::vector parse_nonpositional_args(Context &ctx) { else if (!ctx.arg.section_order.empty()) ctx.arg.z_relro = false; + if (ctx.arg.nmagic) + ctx.arg.z_relro = false; + if (!ctx.arg.shared) { if (!ctx.arg.filter.empty()) Fatal(ctx) << "-filter may not be used without -shared"; @@ -1207,7 +1411,10 @@ std::vector parse_nonpositional_args(Context &ctx) { Fatal(ctx) << "-auxiliary may not be used without -shared"; } - if constexpr (!E::is_rela) + // Even though SH4 is RELA, addends in its relocation records are always + // zero, and actual addends are written to relocated places. So we need + // to handle it as an exception. 
+ if constexpr (!E::is_rela || is_sh4) if (!ctx.arg.apply_dynamic_relocs) Fatal(ctx) << "--no-apply-dynamic-relocs may not be used on " << E::target_name; @@ -1235,10 +1442,36 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.default_version = VER_NDX_LAST_RESERVED + 1; } + for (std::string_view path : version_scripts) { + auto open = [&] { + if (MappedFile *mf = open_file(ctx, std::string(path))) + return mf; + for (std::string_view dir : ctx.arg.library_paths) + if (MappedFile *mf = + open_file(ctx, std::string(dir) + "/" + std::string(path))) + return mf; + Fatal(ctx) << "--version-script: file not found: " << path; + }; + + ReaderContext rctx; + Script(ctx, rctx, open()).parse_version_script(); + } + + if (separate_debug_file) { + if (separate_debug_file->empty()) + ctx.arg.separate_debug_file = ctx.arg.output + ".dbg"; + else + ctx.arg.separate_debug_file = *separate_debug_file; + } + if (ctx.arg.shared && warn_shared_textrel) ctx.arg.warn_textrel = true; - ctx.arg.undefined.push_back(get_symbol(ctx, ctx.arg.entry)); + // We don't want the background process to write to stdout + if (ctx.arg.stats || ctx.arg.perf) + ctx.arg.detach = false; + + ctx.arg.undefined.push_back(ctx.arg.entry); for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) { std::variant *, u64> &val = ctx.arg.defsyms[i].second; @@ -1273,6 +1506,13 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.dependency_file = ctx.arg.chroot + "/" + ctx.arg.dependency_file; } + // Mark GC root symbols + for (Symbol *sym : ctx.arg.undefined) + sym->gc_root = true; + for (Symbol *sym : ctx.arg.require_defined) + sym->gc_root = true; + ctx.arg.entry->gc_root = true; + if (version_shown && remaining.empty()) exit(0); return remaining; @@ -1280,6 +1520,7 @@ std::vector parse_nonpositional_args(Context &ctx) { using E = MOLD_TARGET; +template std::vector expand_response_files(Context &, char **); template std::vector parse_nonpositional_args(Context &ctx); -} // namespace mold::elf +} // 
namespace mold diff --git a/src/config.cc b/src/config.cc new file mode 100644 index 00000000..af578ab8 --- /dev/null +++ b/src/config.cc @@ -0,0 +1,13 @@ +#include "mold.h" +#include "config.h" + +namespace mold { + +std::string get_mold_version() { + if (mold_git_hash.empty()) + return "mold "s + MOLD_VERSION + " (compatible with GNU ld)"; + return "mold "s + MOLD_VERSION + " (" + mold_git_hash + + "; compatible with GNU ld)"; +} + +} // namespace mold diff --git a/elf/elf.cc b/src/elf.cc similarity index 96% rename from elf/elf.cc rename to src/elf.cc index 2ce2ec47..8f78df67 100644 --- a/elf/elf.cc +++ b/src/elf.cc @@ -1,6 +1,6 @@ -#include "mold.h" +#include "elf.h" -namespace mold::elf { +namespace mold { static std::string unknown_type(u32 r_type) { char buf[50]; @@ -890,46 +890,6 @@ std::string rel_to_string(u32 r_type) { return unknown_type(r_type); } -template <> -std::string rel_to_string(u32 r_type) { - switch (r_type) { - CASE(R_ALPHA_NONE); - CASE(R_ALPHA_REFLONG); - CASE(R_ALPHA_REFQUAD); - CASE(R_ALPHA_GPREL32); - CASE(R_ALPHA_LITERAL); - CASE(R_ALPHA_LITUSE); - CASE(R_ALPHA_GPDISP); - CASE(R_ALPHA_BRADDR); - CASE(R_ALPHA_HINT); - CASE(R_ALPHA_SREL16); - CASE(R_ALPHA_SREL32); - CASE(R_ALPHA_SREL64); - CASE(R_ALPHA_GPRELHIGH); - CASE(R_ALPHA_GPRELLOW); - CASE(R_ALPHA_GPREL16); - CASE(R_ALPHA_COPY); - CASE(R_ALPHA_GLOB_DAT); - CASE(R_ALPHA_JMP_SLOT); - CASE(R_ALPHA_RELATIVE); - CASE(R_ALPHA_BRSGP); - CASE(R_ALPHA_TLSGD); - CASE(R_ALPHA_TLSLDM); - CASE(R_ALPHA_DTPMOD64); - CASE(R_ALPHA_GOTDTPREL); - CASE(R_ALPHA_DTPREL64); - CASE(R_ALPHA_DTPRELHI); - CASE(R_ALPHA_DTPRELLO); - CASE(R_ALPHA_DTPREL16); - CASE(R_ALPHA_GOTTPREL); - CASE(R_ALPHA_TPREL64); - CASE(R_ALPHA_TPRELHI); - CASE(R_ALPHA_TPRELLO); - CASE(R_ALPHA_TPREL16); - } - return unknown_type(r_type); -} - template <> std::string rel_to_string(u32 r_type) { switch (r_type) { @@ -946,6 +906,8 @@ std::string rel_to_string(u32 r_type) { CASE(R_LARCH_TLS_TPREL32); CASE(R_LARCH_TLS_TPREL64); 
CASE(R_LARCH_IRELATIVE); + CASE(R_LARCH_TLS_DESC32); + CASE(R_LARCH_TLS_DESC64); CASE(R_LARCH_MARK_LA); CASE(R_LARCH_MARK_PCREL); CASE(R_LARCH_SOP_PUSH_PCREL); @@ -1031,6 +993,23 @@ std::string rel_to_string(u32 r_type) { CASE(R_LARCH_ADD_ULEB128); CASE(R_LARCH_SUB_ULEB128); CASE(R_LARCH_64_PCREL); + CASE(R_LARCH_CALL36); + CASE(R_LARCH_TLS_DESC_PC_HI20); + CASE(R_LARCH_TLS_DESC_PC_LO12); + CASE(R_LARCH_TLS_DESC64_PC_LO20); + CASE(R_LARCH_TLS_DESC64_PC_HI12); + CASE(R_LARCH_TLS_DESC_HI20); + CASE(R_LARCH_TLS_DESC_LO12); + CASE(R_LARCH_TLS_DESC64_LO20); + CASE(R_LARCH_TLS_DESC64_HI12); + CASE(R_LARCH_TLS_DESC_LD); + CASE(R_LARCH_TLS_DESC_CALL); + CASE(R_LARCH_TLS_LE_HI20_R); + CASE(R_LARCH_TLS_LE_ADD_R); + CASE(R_LARCH_TLS_LE_LO12_R); + CASE(R_LARCH_TLS_LD_PCREL20_S2); + CASE(R_LARCH_TLS_GD_PCREL20_S2); + CASE(R_LARCH_TLS_DESC_PCREL20_S2); } return unknown_type(r_type); } @@ -1040,4 +1019,4 @@ std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); } -} // namespace mold::elf +} // namespace mold diff --git a/elf/elf.h b/src/elf.h similarity index 94% rename from elf/elf.h rename to src/elf.h index 4b8dc9fa..08ca6db2 100644 --- a/elf/elf.h +++ b/src/elf.h @@ -1,13 +1,13 @@ #pragma once -#include "../common/integers.h" +#include "../lib/integers.h" #include #include #include #include -namespace mold::elf { +namespace mold { struct X86_64; struct I386; @@ -24,7 +24,6 @@ struct S390X; struct SPARC64; struct M68K; struct SH4; -struct ALPHA; struct LOONGARCH64; struct LOONGARCH32; @@ -81,15 +80,19 @@ enum : u32 { SHT_GROUP = 17, SHT_SYMTAB_SHNDX = 18, SHT_RELR = 19, + SHT_LOOS = 0x60000000, SHT_LLVM_ADDRSIG = 0x6fff4c03, SHT_GNU_HASH = 0x6ffffff6, SHT_GNU_VERDEF = 0x6ffffffd, SHT_GNU_VERNEED = 0x6ffffffe, SHT_GNU_VERSYM = 0x6fffffff, + SHT_HIOS = 0x6fffffff, SHT_X86_64_UNWIND = 0x70000001, SHT_ARM_EXIDX = 0x70000001, SHT_ARM_ATTRIBUTES = 0x70000003, SHT_RISCV_ATTRIBUTES = 0x70000003, + SHT_LOUSER = 0x80000000, + SHT_HIUSER = 0xffffffff, }; enum : u32 { @@ 
-100,6 +103,7 @@ enum : u32 { SHF_STRINGS = 0x20, SHF_INFO_LINK = 0x40, SHF_LINK_ORDER = 0x80, + SHF_OS_NONCONFORMING = 0x100, SHF_GROUP = 0x200, SHF_TLS = 0x400, SHF_COMPRESSED = 0x800, @@ -186,6 +190,7 @@ enum : u32 { PT_GNU_EH_FRAME = 0x6474e550, PT_GNU_STACK = 0x6474e551, PT_GNU_RELRO = 0x6474e552, + PT_GNU_PROPERTY = 0x6474e553, PT_OPENBSD_RANDOMIZE = 0x65a3dbe6, PT_ARM_EXIDX = 0x70000001, PT_RISCV_ATTRIBUTES = 0x70000003, @@ -233,7 +238,6 @@ enum : u32 { EM_AARCH64 = 183, EM_RISCV = 243, EM_LOONGARCH = 258, - EM_ALPHA = 0x9026, }; enum : u32 { @@ -337,6 +341,12 @@ enum : u32 { GNU_PROPERTY_X86_FEATURE_1_IBT = 1, GNU_PROPERTY_X86_FEATURE_1_SHSTK = 2, GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002, + + GNU_PROPERTY_X86_ISA_1_NEEDED = 0xc0008002, + GNU_PROPERTY_X86_ISA_1_BASELINE = 1, + GNU_PROPERTY_X86_ISA_1_V2 = 2, + GNU_PROPERTY_X86_ISA_1_V3 = 4, + GNU_PROPERTY_X86_ISA_1_V4 = 8, }; enum : u32 { @@ -374,8 +384,6 @@ enum : u32 { enum : u32 { STO_RISCV_VARIANT_CC = 0x80, - STO_ALPHA_NOPV = 0x20, - STO_ALPHA_STD_GPLOAD = 0x22, }; enum : u32 { @@ -1222,42 +1230,6 @@ enum : u32 { R_SH_GOTPLT32 = 168, }; -enum : u32 { - R_ALPHA_NONE = 0, - R_ALPHA_REFLONG = 1, - R_ALPHA_REFQUAD = 2, - R_ALPHA_GPREL32 = 3, - R_ALPHA_LITERAL = 4, - R_ALPHA_LITUSE = 5, - R_ALPHA_GPDISP = 6, - R_ALPHA_BRADDR = 7, - R_ALPHA_HINT = 8, - R_ALPHA_SREL16 = 9, - R_ALPHA_SREL32 = 10, - R_ALPHA_SREL64 = 11, - R_ALPHA_GPRELHIGH = 17, - R_ALPHA_GPRELLOW = 18, - R_ALPHA_GPREL16 = 19, - R_ALPHA_COPY = 24, - R_ALPHA_GLOB_DAT = 25, - R_ALPHA_JMP_SLOT = 26, - R_ALPHA_RELATIVE = 27, - R_ALPHA_BRSGP = 28, - R_ALPHA_TLSGD = 29, - R_ALPHA_TLSLDM = 30, - R_ALPHA_DTPMOD64 = 31, - R_ALPHA_GOTDTPREL = 32, - R_ALPHA_DTPREL64 = 33, - R_ALPHA_DTPRELHI = 34, - R_ALPHA_DTPRELLO = 35, - R_ALPHA_DTPREL16 = 36, - R_ALPHA_GOTTPREL = 37, - R_ALPHA_TPREL64 = 38, - R_ALPHA_TPRELHI = 39, - R_ALPHA_TPRELLO = 40, - R_ALPHA_TPREL16 = 41, -}; - enum : u32 { R_LARCH_NONE = 0, R_LARCH_32 = 1, @@ -1272,6 +1244,8 @@ enum : u32 { 
R_LARCH_TLS_TPREL32 = 10, R_LARCH_TLS_TPREL64 = 11, R_LARCH_IRELATIVE = 12, + R_LARCH_TLS_DESC32 = 13, + R_LARCH_TLS_DESC64 = 14, R_LARCH_MARK_LA = 20, R_LARCH_MARK_PCREL = 21, R_LARCH_SOP_PUSH_PCREL = 22, @@ -1357,6 +1331,23 @@ enum : u32 { R_LARCH_ADD_ULEB128 = 107, R_LARCH_SUB_ULEB128 = 108, R_LARCH_64_PCREL = 109, + R_LARCH_CALL36 = 110, + R_LARCH_TLS_DESC_PC_HI20 = 111, + R_LARCH_TLS_DESC_PC_LO12 = 112, + R_LARCH_TLS_DESC64_PC_LO20 = 113, + R_LARCH_TLS_DESC64_PC_HI12 = 114, + R_LARCH_TLS_DESC_HI20 = 115, + R_LARCH_TLS_DESC_LO12 = 116, + R_LARCH_TLS_DESC64_LO20 = 117, + R_LARCH_TLS_DESC64_HI12 = 118, + R_LARCH_TLS_DESC_LD = 119, + R_LARCH_TLS_DESC_CALL = 120, + R_LARCH_TLS_LE_HI20_R = 121, + R_LARCH_TLS_LE_ADD_R = 122, + R_LARCH_TLS_LE_LO12_R = 123, + R_LARCH_TLS_LD_PCREL20_S2 = 124, + R_LARCH_TLS_GD_PCREL20_S2 = 125, + R_LARCH_TLS_DESC_PCREL20_S2 = 126, }; // @@ -1740,7 +1731,7 @@ struct ElfSym { u8 st_bind : 4; u8 st_type : 4; u8 arm64_variant_pcs : 1; - u8 : 6; + u8 : 5; u8 st_visibility : 2; #endif @@ -1781,33 +1772,6 @@ struct ElfSym { ul64 st_size; }; -template <> -struct ElfSym { - bool is_undef() const { return st_shndx == SHN_UNDEF; } - bool is_abs() const { return st_shndx == SHN_ABS; } - bool is_common() const { return st_shndx == SHN_COMMON; } - bool is_weak() const { return st_bind == STB_WEAK; } - bool is_undef_weak() const { return is_undef() && is_weak(); } - - ul32 st_name; - -#ifdef __LITTLE_ENDIAN__ - u8 st_type : 4; - u8 st_bind : 4; - u8 st_visibility : 2; - u8 alpha_st_other : 6; // contains STO_ALPHA_NOPV, STO_ALPHA_STD_GPLOAD or 0 -#else - u8 st_bind : 4; - u8 st_type : 4; - u8 alpha_st_other : 6; - u8 st_visibility : 2; -#endif - - ul16 st_shndx; - ul64 st_value; - ul64 st_size; -}; - template <> struct ElfRel { ElfRel() = default; @@ -1822,6 +1786,22 @@ struct ElfRel { ib64 r_addend; }; +template <> +struct ElfRel { + ElfRel() = default; + + // Addend is ignored except for base relocations because even though + // SH4 is RELA, r_addend 
is ignored in most cases and works as if it + // were REL. + ElfRel(u64 offset, u32 type, u32 sym, i64 addend) + : r_offset(offset), r_type(type), r_sym(sym), r_addend(sym ? 0 : addend) {} + + ul32 r_offset; + u8 r_type; + ul24 r_sym; + il32 r_addend; +}; + // // Machine descriptions // @@ -1845,7 +1825,6 @@ template concept is_s390x = std::same_as; template concept is_sparc64 = std::same_as; template concept is_m68k = std::same_as; template concept is_sh4 = std::same_as; -template concept is_alpha = std::same_as; template concept is_loongarch64 = std::same_as; template concept is_loongarch32 = std::same_as; @@ -1869,7 +1848,7 @@ struct X86_64 { static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 8; - static constexpr u8 filler[] = { 0xcc }; + static constexpr u8 filler[] = { 0xcc }; // int3 static constexpr u32 R_COPY = R_X86_64_COPY; static constexpr u32 R_GLOB_DAT = R_X86_64_GLOB_DAT; @@ -1894,7 +1873,7 @@ struct I386 { static constexpr u32 plt_hdr_size = 16; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; - static constexpr u8 filler[] = { 0xcc }; + static constexpr u8 filler[] = { 0xcc }; // int3 static constexpr u32 R_COPY = R_386_COPY; static constexpr u32 R_GLOB_DAT = R_386_GLOB_DAT; @@ -1921,7 +1900,7 @@ struct ARM64 { static constexpr u32 pltgot_size = 16; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 16; - static constexpr u8 filler[] = { 0x00, 0x7d, 0x20, 0xd4 }; + static constexpr u8 filler[] = { 0x00, 0x7d, 0x20, 0xd4 }; // brk static constexpr u32 R_COPY = R_AARCH64_COPY; static constexpr u32 R_GLOB_DAT = R_AARCH64_GLOB_DAT; @@ -1948,7 +1927,7 @@ struct ARM32 { static constexpr u32 pltgot_size = 16; static constexpr u32 thunk_hdr_size = 16; static constexpr u32 thunk_size = 16; - static constexpr u8 filler[] = { 0xff, 0xde }; + static constexpr u8 filler[] = { 0xff, 0xde }; // udf static constexpr u32 R_COPY = R_ARM_COPY; static 
constexpr u32 R_GLOB_DAT = R_ARM_GLOB_DAT; @@ -1974,7 +1953,7 @@ struct RV64 { static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; - static constexpr u8 filler[] = { 0x02, 0x90 }; + static constexpr u8 filler[] = { 0x02, 0x90 }; // c.ebreak static constexpr u32 R_COPY = R_RISCV_COPY; static constexpr u32 R_GLOB_DAT = R_RISCV_64; @@ -2007,7 +1986,7 @@ struct RV32 { static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; - static constexpr u8 filler[] = { 0x02, 0x90 }; + static constexpr u8 filler[] = { 0x02, 0x90 }; // c.ebreak static constexpr u32 R_COPY = R_RISCV_COPY; static constexpr u32 R_GLOB_DAT = R_RISCV_32; @@ -2044,7 +2023,7 @@ struct PPC32 { static constexpr u32 pltgot_size = 36; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 36; - static constexpr u8 filler[] = { 0x7f, 0xe0, 0x00, 0x08 }; + static constexpr u8 filler[] = { 0x7f, 0xe0, 0x00, 0x08 }; // trap static constexpr u32 R_COPY = R_PPC_COPY; static constexpr u32 R_GLOB_DAT = R_PPC_GLOB_DAT; @@ -2086,7 +2065,7 @@ struct PPC64V1 : PPC64 { static constexpr u32 pltgot_size = 0; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 28; - static constexpr u8 filler[] = { 0x7f, 0xe0, 0x00, 0x08 }; + static constexpr u8 filler[] = { 0x7f, 0xe0, 0x00, 0x08 }; // trap }; struct PPC64V2 : PPC64 { @@ -2097,7 +2076,7 @@ struct PPC64V2 : PPC64 { static constexpr u32 pltgot_size = 0; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 24; - static constexpr u8 filler[] = { 0x08, 0x00, 0xe0, 0x7f }; + static constexpr u8 filler[] = { 0x08, 0x00, 0xe0, 0x7f }; // trap }; struct S390X { @@ -2110,7 +2089,7 @@ struct S390X { static constexpr u32 plt_hdr_size = 48; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; - static constexpr u8 filler[] = { 0x07, 0x00 }; + static constexpr u8 filler[] = { 0x07, 
0x00 }; // nop static constexpr u32 R_COPY = R_390_COPY; static constexpr u32 R_GLOB_DAT = R_390_GLOB_DAT; @@ -2134,7 +2113,7 @@ struct SPARC64 { static constexpr u32 plt_hdr_size = 128; static constexpr u32 plt_size = 32; static constexpr u32 pltgot_size = 32; - static constexpr u8 filler[] = { 0x91, 0xd0, 0x20, 0x05 }; + static constexpr u8 filler[] = { 0x91, 0xd0, 0x20, 0x05 }; // ta 5 static constexpr u32 R_COPY = R_SPARC_COPY; static constexpr u32 R_GLOB_DAT = R_SPARC_GLOB_DAT; @@ -2158,7 +2137,7 @@ struct M68K { static constexpr u32 plt_hdr_size = 18; static constexpr u32 plt_size = 14; static constexpr u32 pltgot_size = 8; - static constexpr u8 filler[] = { 0x4a, 0xfc }; + static constexpr u8 filler[] = { 0x4a, 0xfc }; // illegal static constexpr u32 R_COPY = R_68K_COPY; static constexpr u32 R_GLOB_DAT = R_68K_GLOB_DAT; @@ -2181,7 +2160,7 @@ struct SH4 { static constexpr u32 plt_hdr_size = 16; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 12; - static constexpr u8 filler[] = { 0x00, 0x00 }; + static constexpr u8 filler[] = { 0x00, 0x90 }; // nop static constexpr u32 R_COPY = R_SH_COPY; static constexpr u32 R_GLOB_DAT = R_SH_GLOB_DAT; @@ -2194,29 +2173,6 @@ struct SH4 { static constexpr u32 R_FUNCALL[] = { R_SH_PLT32 }; }; -struct ALPHA { - static constexpr std::string_view target_name = "alpha"; - static constexpr bool is_64 = true; - static constexpr bool is_le = true; - static constexpr bool is_rela = true; - static constexpr u32 page_size = 65536; - static constexpr u32 e_machine = EM_ALPHA; - static constexpr u32 plt_hdr_size = 0; - static constexpr u32 plt_size = 0; - static constexpr u32 pltgot_size = 0; - static constexpr u8 filler[] = { 0x81, 0x00, 0x00, 0x00 }; - - static constexpr u32 R_COPY = R_ALPHA_COPY; - static constexpr u32 R_GLOB_DAT = R_ALPHA_GLOB_DAT; - static constexpr u32 R_JUMP_SLOT = R_ALPHA_JMP_SLOT; - static constexpr u32 R_ABS = R_ALPHA_REFQUAD; - static constexpr u32 R_RELATIVE = R_ALPHA_RELATIVE; - static 
constexpr u32 R_DTPOFF = R_ALPHA_DTPREL64; - static constexpr u32 R_TPOFF = R_ALPHA_TPREL64; - static constexpr u32 R_DTPMOD = R_ALPHA_DTPMOD64; - static constexpr u32 R_FUNCALL[] = {}; -}; - struct LOONGARCH64 { static constexpr std::string_view target_name = "loongarch64"; static constexpr bool is_64 = true; @@ -2227,9 +2183,7 @@ struct LOONGARCH64 { static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; - static constexpr u32 thunk_hdr_size = 0; - static constexpr u32 thunk_size = 8; - static constexpr u8 filler[] = { 0x00, 0x00, 0x2a, 0x00 }; + static constexpr u8 filler[] = { 0x00, 0x00, 0x2a, 0x00 }; // break 0 static constexpr u32 R_COPY = R_LARCH_COPY; static constexpr u32 R_GLOB_DAT = R_LARCH_64; @@ -2240,7 +2194,8 @@ struct LOONGARCH64 { static constexpr u32 R_DTPOFF = R_LARCH_TLS_DTPREL64; static constexpr u32 R_TPOFF = R_LARCH_TLS_TPREL64; static constexpr u32 R_DTPMOD = R_LARCH_TLS_DTPMOD64; - static constexpr u32 R_FUNCALL[] = { R_LARCH_B26 }; + static constexpr u32 R_TLSDESC = R_LARCH_TLS_DESC64; + static constexpr u32 R_FUNCALL[] = { R_LARCH_B26, R_LARCH_CALL36 }; }; struct LOONGARCH32 { @@ -2253,9 +2208,7 @@ struct LOONGARCH32 { static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; - static constexpr u32 thunk_hdr_size = 0; - static constexpr u32 thunk_size = 8; - static constexpr u8 filler[] = { 0x00, 0x00, 0x2a, 0x00 }; + static constexpr u8 filler[] = { 0x00, 0x00, 0x2a, 0x00 }; // break 0 static constexpr u32 R_COPY = R_LARCH_COPY; static constexpr u32 R_GLOB_DAT = R_LARCH_32; @@ -2266,7 +2219,8 @@ struct LOONGARCH32 { static constexpr u32 R_DTPOFF = R_LARCH_TLS_DTPREL32; static constexpr u32 R_TPOFF = R_LARCH_TLS_TPREL32; static constexpr u32 R_DTPMOD = R_LARCH_TLS_DTPMOD32; - static constexpr u32 R_FUNCALL[] = { R_LARCH_B26 }; + static constexpr u32 R_TLSDESC = R_LARCH_TLS_DESC32; + static constexpr u32 R_FUNCALL[] = { 
R_LARCH_B26, R_LARCH_CALL36 }; }; -} // namespace mold::elf +} // namespace mold diff --git a/common/filetype.h b/src/filetype.h similarity index 61% rename from common/filetype.h rename to src/filetype.h index e49bec99..50b605da 100644 --- a/common/filetype.h +++ b/src/filetype.h @@ -1,7 +1,7 @@ #pragma once -#include "common.h" -#include "../elf/elf.h" +#include "../lib/common.h" +#include "elf.h" namespace mold { @@ -10,30 +10,25 @@ enum class FileType { EMPTY, ELF_OBJ, ELF_DSO, - MACH_OBJ, - MACH_EXE, - MACH_DYLIB, - MACH_BUNDLE, - MACH_UNIVERSAL, AR, THIN_AR, - TAPI, TEXT, GCC_LTO_OBJ, LLVM_BITCODE, }; -template -bool is_text_file(MappedFile *mf) { +inline bool is_text_file(MappedFile *mf) { + auto istext = [](char c) { + return isprint(c) || c == '\n' || c == '\t'; + }; + u8 *data = mf->data; - return mf->size >= 4 && isprint(data[0]) && isprint(data[1]) && - isprint(data[2]) && isprint(data[3]); + return mf->size >= 4 && istext(data[0]) && istext(data[1]) && + istext(data[2]) && istext(data[3]); } -template -inline bool is_gcc_lto_obj(Context &ctx, MappedFile *mf) { - using namespace mold::elf; - +template +inline bool is_gcc_lto_obj(MappedFile *mf, bool has_plugin) { const char *data = mf->get_contents().data(); ElfEhdr &ehdr = *(ElfEhdr *)data; ElfShdr *sh_begin = (ElfShdr *)(data + ehdr.e_shoff); @@ -50,10 +45,10 @@ inline bool is_gcc_lto_obj(Context &ctx, MappedFile *mf) { // the LTO linker plugin is available and falls back as regular // objects otherwise. GCC FAT LTO object can be identified by the // presence of `.gcc.lto_.symtab` section. 
- if (!ctx.arg.plugin.empty()) { + if (has_plugin) { std::string_view name = data + shdrs[shstrtab_idx].sh_offset + sec.sh_name; if (name.starts_with(".gnu.lto_.symtab.")) - return true; + return true; } if (sec.sh_type != SHT_SYMTAB) @@ -85,11 +80,10 @@ inline bool is_gcc_lto_obj(Context &ctx, MappedFile *mf) { return false; } -template -FileType get_file_type(Context &ctx, MappedFile *mf) { - using namespace elf; - +template +FileType get_file_type(Context &ctx, MappedFile *mf) { std::string_view data = mf->get_contents(); + bool has_plugin = !ctx.arg.plugin.empty(); if (data.empty()) return FileType::EMPTY; @@ -102,10 +96,10 @@ FileType get_file_type(Context &ctx, MappedFile *mf) { if (ehdr.e_type == ET_REL) { if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) { - if (is_gcc_lto_obj(ctx, mf)) + if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } else { - if (is_gcc_lto_obj(ctx, mf)) + if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } return FileType::ELF_OBJ; @@ -118,10 +112,10 @@ FileType get_file_type(Context &ctx, MappedFile *mf) { if (ehdr.e_type == ET_REL) { if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) { - if (is_gcc_lto_obj(ctx, mf)) + if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } else { - if (is_gcc_lto_obj(ctx, mf)) + if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } return FileType::ELF_OBJ; @@ -133,28 +127,10 @@ FileType get_file_type(Context &ctx, MappedFile *mf) { return FileType::UNKNOWN; } - if (data.starts_with("\xcf\xfa\xed\xfe")) { - switch (*(ul32 *)(data.data() + 12)) { - case 1: // MH_OBJECT - return FileType::MACH_OBJ; - case 2: // MH_EXECUTE - return FileType::MACH_EXE; - case 6: // MH_DYLIB - return FileType::MACH_DYLIB; - case 8: // MH_BUNDLE - return FileType::MACH_BUNDLE; - } - return FileType::UNKNOWN; - } - if (data.starts_with("!\n")) return FileType::AR; if (data.starts_with("!\n")) return FileType::THIN_AR; - if (data.starts_with("--- !tapi-tbd")) - return 
FileType::TAPI; - if (data.starts_with("\xca\xfe\xba\xbe")) - return FileType::MACH_UNIVERSAL; if (is_text_file(mf)) return FileType::TEXT; if (data.starts_with("\xde\xc0\x17\x0b")) @@ -164,29 +140,23 @@ FileType get_file_type(Context &ctx, MappedFile *mf) { return FileType::UNKNOWN; } -inline std::string filetype_to_string(FileType type) { - switch (type) { - case FileType::UNKNOWN: return "UNKNOWN"; - case FileType::EMPTY: return "EMPTY"; - case FileType::ELF_OBJ: return "ELF_OBJ"; - case FileType::ELF_DSO: return "ELF_DSO"; - case FileType::MACH_EXE: return "MACH_EXE"; - case FileType::MACH_OBJ: return "MACH_OBJ"; - case FileType::MACH_DYLIB: return "MACH_DYLIB"; - case FileType::MACH_BUNDLE: return "MACH_BUNDLE"; - case FileType::MACH_UNIVERSAL: return "MACH_UNIVERSAL"; - case FileType::AR: return "AR"; - case FileType::THIN_AR: return "THIN_AR"; - case FileType::TAPI: return "TAPI"; - case FileType::TEXT: return "TEXT"; - case FileType::GCC_LTO_OBJ: return "GCC_LTO_OBJ"; - case FileType::LLVM_BITCODE: return "LLVM_BITCODE"; - } - return "UNKNOWN"; -} - inline std::ostream &operator<<(std::ostream &out, FileType type) { - out << filetype_to_string(type); + auto to_string = [&] { + switch (type) { + case FileType::UNKNOWN: return "UNKNOWN"; + case FileType::EMPTY: return "EMPTY"; + case FileType::ELF_OBJ: return "ELF_OBJ"; + case FileType::ELF_DSO: return "ELF_DSO"; + case FileType::AR: return "AR"; + case FileType::THIN_AR: return "THIN_AR"; + case FileType::TEXT: return "TEXT"; + case FileType::GCC_LTO_OBJ: return "GCC_LTO_OBJ"; + case FileType::LLVM_BITCODE: return "LLVM_BITCODE"; + default: return "UNKNOWN"; + } + }; + + out << to_string(); return out; } diff --git a/elf/gc-sections.cc b/src/gc-sections.cc similarity index 86% rename from elf/gc-sections.cc rename to src/gc-sections.cc index 31764098..efc6cd6d 100644 --- a/elf/gc-sections.cc +++ b/src/gc-sections.cc @@ -7,7 +7,7 @@ #include #include -namespace mold::elf { +namespace mold { template static 
bool should_keep(const InputSection &isec) { @@ -15,6 +15,10 @@ static bool should_keep(const InputSection &isec) { u32 flags = isec.shdr().sh_flags; std::string_view name = isec.name(); + if constexpr (is_ppc32) + if (name == ".got2") + return true; + return (flags & SHF_GNU_RETAIN) || type == SHT_NOTE || type == SHT_INIT_ARRAY || @@ -34,44 +38,10 @@ static bool mark_section(InputSection *isec) { } template -static void visit(Context &ctx, InputSection *isec, - tbb::feeder *> &feeder, i64 depth) { - assert(isec->is_visited); - - // If this is a text section, .eh_frame may contain records - // describing how to handle exceptions for that function. - // We want to keep associated .eh_frame records. - for (FdeRecord &fde : isec->get_fdes()) - for (const ElfRel &rel : fde.get_rels(isec->file).subspan(1)) - if (Symbol *sym = isec->file.symbols[rel.r_sym]) - if (mark_section(sym->get_input_section())) - feeder.add(sym->get_input_section()); - - for (const ElfRel &rel : isec->get_rels(ctx)) { - Symbol &sym = *isec->file.symbols[rel.r_sym]; - - // Symbol can refer either a section fragment or an input section. - // Mark a fragment as alive. - if (SectionFragment *frag = sym.get_frag()) { - frag->is_alive = true; - continue; - } - - // Mark a section alive. For better performacne, we don't call - // `feeder.add` too often. 
- if (mark_section(sym.get_input_section())) { - if (depth < 3) - visit(ctx, sym.get_input_section(), feeder, depth + 1); - else - feeder.add(sym.get_input_section()); - } - } -} - -template -static void collect_root_set(Context &ctx, - tbb::concurrent_vector *> &rootset) { +static tbb::concurrent_vector *> +collect_root_set(Context &ctx) { Timer t(ctx, "collect_root_set"); + tbb::concurrent_vector *> rootset; auto enqueue_section = [&](InputSection *isec) { if (mark_section(isec)) @@ -106,22 +76,13 @@ static void collect_root_set(Context &ctx, } }); - // Add sections containing exported symbols + // Add sections containing gc root or exported symbols tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->symbols) - if (sym->file == file && sym->is_exported) + if (sym->file == file && (sym->gc_root || sym->is_exported)) enqueue_symbol(sym); }); - // Add sections referenced by root symbols. - enqueue_symbol(get_symbol(ctx, ctx.arg.entry)); - - for (Symbol *sym : ctx.arg.undefined) - enqueue_symbol(sym); - - for (Symbol *sym : ctx.arg.require_defined) - enqueue_symbol(sym); - // .eh_frame consists of variable-length records called CIE and FDE // records, and they are a unit of inclusion or exclusion. // We just keep all CIEs and everything that are referenced by them. @@ -130,6 +91,43 @@ static void collect_root_set(Context &ctx, for (const ElfRel &rel : cie.get_rels()) enqueue_symbol(file->symbols[rel.r_sym]); }); + + return rootset; +} + +template +static void visit(Context &ctx, InputSection *isec, + tbb::feeder *> &feeder, i64 depth) { + assert(isec->is_visited); + + // If this is a text section, .eh_frame may contain records + // describing how to handle exceptions for that function. + // We want to keep associated .eh_frame records. 
+ for (FdeRecord &fde : isec->get_fdes()) + for (const ElfRel &rel : fde.get_rels(isec->file).subspan(1)) + if (Symbol *sym = isec->file.symbols[rel.r_sym]) + if (mark_section(sym->get_input_section())) + feeder.add(sym->get_input_section()); + + for (const ElfRel &rel : isec->get_rels(ctx)) { + Symbol &sym = *isec->file.symbols[rel.r_sym]; + + // Symbol can refer to either a section fragment or an input section. + // Mark a fragment as alive. + if (SectionFragment *frag = sym.get_frag()) { + frag->is_alive = true; + continue; + } + + // Mark a section alive. For better performacne, we don't call + // `feeder.add` too often. + if (mark_section(sym.get_input_section())) { + if (depth < 3) + visit(ctx, sym.get_input_section(), feeder, depth + 1); + else + feeder.add(sym.get_input_section()); + } + } } // Mark all reachable sections @@ -139,7 +137,7 @@ static void mark(Context &ctx, Timer t(ctx, "mark"); tbb::parallel_for_each(rootset, [&](InputSection *isec, - tbb::feeder *> &feeder) { + tbb::feeder *> &feeder) { visit(ctx, isec, feeder, 0); }); } @@ -154,7 +152,7 @@ static void sweep(Context &ctx) { for (std::unique_ptr> &isec : file->sections) { if (isec && isec->is_alive && !isec->is_visited) { if (ctx.arg.print_gc_sections) - SyncOut(ctx) << "removing unused section " << *isec; + Out(ctx) << "removing unused section " << *isec; isec->kill(); counter++; } @@ -165,9 +163,7 @@ static void sweep(Context &ctx) { template void gc_sections(Context &ctx) { Timer t(ctx, "gc"); - - tbb::concurrent_vector *> rootset; - collect_root_set(ctx, rootset); + tbb::concurrent_vector *> rootset = collect_root_set(ctx); mark(ctx, rootset); sweep(ctx); } @@ -176,4 +172,4 @@ using E = MOLD_TARGET; template void gc_sections(Context &ctx); -} // namespace mold::elf +} // namespace mold diff --git a/elf/gdb-index.cc b/src/gdb-index.cc similarity index 98% rename from elf/gdb-index.cc rename to src/gdb-index.cc index 646d21ea..a87b7691 100644 --- a/elf/gdb-index.cc +++ b/src/gdb-index.cc 
@@ -60,7 +60,7 @@ #include #include -namespace mold::elf { +namespace mold { enum DwarfKind { DWARF2_32, DWARF5_32, DWARF2_64, DWARF5_64 }; @@ -135,13 +135,13 @@ struct NameType { } std::string_view name; - u32 hash; + u64 hash; u8 type; }; struct MapValue { - u32 hash = 0; - Atomic count = 0; + u32 gdb_hash = 0; + Atomic count; u32 name_offset = 0; u32 type_offset = 0; }; @@ -539,7 +539,7 @@ static i64 read_pubnames_cu(Context &ctx, const PubnamesHdr &hdr, u8 type = *p++; std::string_view name = (char *)p; p += name.size() + 1; - cu->nametypes.push_back({name, gdb_hash(name), type}); + cu->nametypes.push_back({name, hash_string(name), type}); } return size; @@ -683,7 +683,8 @@ void write_gdb_index(Context &ctx) { for (NameType &nt : cu.nametypes) { MapValue *ent; bool inserted; - std::tie(ent, inserted) = map.insert(nt.name, nt.hash, MapValue{nt.hash}); + std::tie(ent, inserted) = map.insert(nt.name, nt.hash, + MapValue{gdb_hash(nt.name)}); ent->count++; cu.entries.push_back(ent); } @@ -750,7 +751,7 @@ void write_gdb_index(Context &ctx) { ul32 *ht = (ul32 *)(buf + hdr.symtab_offset); for (Entry *ent : entries) { - u32 hash = ent->value.hash; + u32 hash = ent->value.gdb_hash; u32 step = ((hash * 17) & mask) | 1; u32 j = hash & mask; @@ -790,4 +791,4 @@ using E = MOLD_TARGET; template void write_gdb_index(Context &); -} // namespace mold::elf +} // namespace mold diff --git a/elf/icf.cc b/src/icf.cc similarity index 91% rename from elf/icf.cc rename to src/icf.cc index 1d8b2e0b..cdf70760 100644 --- a/elf/icf.cc +++ b/src/icf.cc @@ -65,7 +65,7 @@ // conditions. 
#include "mold.h" -#include "blake3.h" +#include "../lib/siphash.h" #include #include @@ -78,34 +78,43 @@ static constexpr int64_t HASH_SIZE = 16; -typedef std::array Digest; +using Digest = std::array; namespace std { template <> struct hash { size_t operator()(const Digest &k) const { - return *(int64_t *)&k[0]; + static_assert(sizeof(size_t) <= HASH_SIZE); + size_t val; + memcpy(&val, k.data(), sizeof(size_t)); + return val; } }; } -namespace mold::elf { +namespace mold { + +static u8 hmac_key[16]; template static void uniquify_cies(Context &ctx) { Timer t(ctx, "uniquify_cies"); std::vector *> cies; + auto find = [&](CieRecord &cie) -> i64 { + for (i64 i = 0; i < cies.size(); i++) + if (cie_equals(cie, *cies[i])) + return i; + return -1; + }; + for (ObjectFile *file : ctx.objs) { for (CieRecord &cie : file->cies) { - for (i64 i = 0; i < cies.size(); i++) { - if (cie.equals(*cies[i])) { - cie.icf_idx = i; - goto found; - } + if (i64 idx = find(cie); idx != -1) { + cie.icf_idx = idx; + } else { + cie.icf_idx = cies.size(); + cies.push_back(&cie); } - cie.icf_idx = cies.size(); - cies.push_back(&cie); - found:; } } } @@ -119,26 +128,14 @@ static bool is_eligible(Context &ctx, InputSection &isec) { shdr.sh_type == SHT_NOBITS || is_c_identifier(name)) return false; - if (shdr.sh_flags & SHF_EXECINSTR) { + if (shdr.sh_flags & SHF_EXECINSTR) return (ctx.arg.icf_all || !isec.address_taken) && name != ".init" && name != ".fini"; - } else { - bool is_readonly = !(shdr.sh_flags & SHF_WRITE); - bool is_relro = isec.output_section && isec.output_section->is_relro; - return (ctx.arg.ignore_data_address_equality || !isec.address_taken) && - (is_readonly || is_relro); - } -} -static Digest digest_final(blake3_hasher *hasher) { - assert(HASH_SIZE <= BLAKE3_OUT_LEN); - - u8 buf[BLAKE3_OUT_LEN]; - blake3_hasher_finalize(hasher, buf, BLAKE3_OUT_LEN); - - Digest digest; - memcpy(digest.data(), buf, HASH_SIZE); - return digest; + bool is_readonly = !(shdr.sh_flags & SHF_WRITE); + 
bool is_relro = isec.output_section && isec.output_section->is_relro; + return (ctx.arg.ignore_data_address_equality || !isec.address_taken) && + (is_readonly || is_relro); } template @@ -234,16 +231,15 @@ static void merge_leaf_nodes(Context &ctx) { template static Digest compute_digest(Context &ctx, InputSection &isec) { - blake3_hasher hasher; - blake3_hasher_init(&hasher); + SipHash13_128 hasher(hmac_key); auto hash = [&](auto val) { - blake3_hasher_update(&hasher, (u8 *)&val, sizeof(val)); + hasher.update((u8 *)&val, sizeof(val)); }; auto hash_string = [&](std::string_view str) { hash(str.size()); - blake3_hasher_update(&hasher, (u8 *)str.data(), str.size()); + hasher.update((u8 *)str.data(), str.size()); }; auto hash_symbol = [&](Symbol &sym) { @@ -299,7 +295,9 @@ static Digest compute_digest(Context &ctx, InputSection &isec) { hash_symbol(*isec.file.symbols[rel.r_sym]); } - return digest_final(&hasher); + Digest digest; + hasher.finish(digest.data()); + return digest; } template @@ -400,7 +398,7 @@ static void gather_edges(Context &ctx, template static i64 propagate(std::span> digests, std::span edges, std::span edge_indices, - bool &slot, BitVector &converged, + bool &slot, std::span converged, tbb::affinity_partitioner &ap) { static Counter round("icf_round"); round++; @@ -409,25 +407,24 @@ static i64 propagate(std::span> digests, tbb::enumerable_thread_specific changed; tbb::parallel_for((i64)0, num_digests, [&](i64 i) { - if (converged.get(i)) + if (converged[i]) return; - blake3_hasher hasher; - blake3_hasher_init(&hasher); - blake3_hasher_update(&hasher, digests[2][i].data(), HASH_SIZE); + SipHash13_128 hasher(hmac_key); + hasher.update(digests[2][i].data(), HASH_SIZE); i64 begin = edge_indices[i]; i64 end = (i + 1 == num_digests) ? 
edges.size() : edge_indices[i + 1]; for (i64 j : edges.subspan(begin, end - begin)) - blake3_hasher_update(&hasher, digests[slot][j].data(), HASH_SIZE); + hasher.update(digests[slot][j].data(), HASH_SIZE); - digests[!slot][i] = digest_final(&hasher); + hasher.finish(digests[!slot][i].data()); if (digests[slot][i] == digests[!slot][i]) { // This node has converged. Skip further iterations as it will // yield the same hash. - converged.set(i); + converged[i] = true; } else { changed.local()++; } @@ -479,17 +476,17 @@ static void print_icf_sections(Context &ctx) { if (begin == end) continue; - SyncOut(ctx) << "selected section " << *leader; + Out(ctx) << "selected section " << *leader; i64 n = 0; for (auto it = begin; it != end; it++) { - SyncOut(ctx) << " removing identical section " << *it->second; + Out(ctx) << " removing identical section " << *it->second; n++; } saved_bytes += leader->contents.size() * n; } - SyncOut(ctx) << "ICF saved " << saved_bytes << " bytes"; + Out(ctx) << "ICF saved " << saved_bytes << " bytes"; } template @@ -498,6 +495,8 @@ void icf_sections(Context &ctx) { if (ctx.objs.empty()) return; + get_random_bytes(hmac_key, sizeof(hmac_key)); + uniquify_cies(ctx); merge_leaf_nodes(ctx); @@ -522,7 +521,7 @@ void icf_sections(Context &ctx) { std::vector edge_indices; gather_edges(ctx, sections, edges, edge_indices); - BitVector converged(digests[0].size()); + std::vector converged(digests[0].size()); bool slot = 0; // Execute the propagation rounds until convergence is obtained. @@ -565,7 +564,7 @@ void icf_sections(Context &ctx) { } } - // Group sections by BLAKE3 digest. + // Group sections by hash values. 
{ Timer t(ctx, "group"); @@ -600,7 +599,7 @@ void icf_sections(Context &ctx) { static Counter eliminated("icf_eliminated"); tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { for (std::unique_ptr> &isec : file->sections) { - if (isec && isec->is_alive && isec->is_killed_by_icf()) { + if (isec && isec->is_alive && isec->icf_removed()) { isec->kill(); eliminated++; } @@ -613,4 +612,4 @@ using E = MOLD_TARGET; template void icf_sections(Context &ctx); -} // namespace mold::elf +} // namespace mold diff --git a/elf/input-files.cc b/src/input-files.cc similarity index 75% rename from elf/input-files.cc rename to src/input-files.cc index 49580a09..afe1fc1e 100644 --- a/elf/input-files.cc +++ b/src/input-files.cc @@ -8,10 +8,61 @@ # include #endif -namespace mold::elf { +namespace mold { +// If we haven't seen the same `key` before, create a new instance +// of Symbol and returns it. Otherwise, returns the previously- +// instantiated object. `key` is usually the same as `name`. template -InputFile::InputFile(Context &ctx, MappedFile> *mf) +Symbol *get_symbol(Context &ctx, std::string_view key, + std::string_view name) { + typename decltype(ctx.symbol_map)::const_accessor acc; + ctx.symbol_map.insert(acc, {key, Symbol(name, ctx.arg.demangle)}); + return const_cast *>(&acc->second); +} + +template +Symbol *get_symbol(Context &ctx, std::string_view key) { + std::string_view name = key.substr(0, key.find('@')); + return get_symbol(ctx, key, name); +} + +template +static bool is_rust_symbol(const Symbol &sym) { + // The legacy Rust mangling scheme is indistinguishtable from C++. + // We don't want to accidentally demangle C++ symbols as Rust ones. + // So, the legacy mangling scheme will be demangled only when we + // know the object file was created by rustc. + if (sym.file && !sym.file->is_dso && ((ObjectFile *)sym.file)->is_rust_obj) + return true; + + // "_R" is the prefix of the new Rust mangling scheme. 
+ return sym.name().starts_with("_R"); +} + +template +std::string_view demangle(const Symbol &sym) { + if (is_rust_symbol(sym)) { + if (std::optional s = demangle_rust(sym.name())) + return *s; + } else { + if (std::optional s = demangle_cpp(sym.name())) + return *s; + } + return sym.name(); +} + +template +std::ostream &operator<<(std::ostream &out, const Symbol &sym) { + if (sym.demangle) + out << demangle(sym); + else + out << sym.name(); + return out; +} + +template +InputFile::InputFile(Context &ctx, MappedFile *mf) : mf(mf), filename(mf->name) { if (mf->size < sizeof(ElfEhdr)) Fatal(ctx) << *this << ": file too small"; @@ -56,22 +107,6 @@ ElfShdr *InputFile::find_section(i64 type) { return nullptr; } -template -void InputFile::clear_symbols() { - for (Symbol *sym : get_global_syms()) { - if (__atomic_load_n(&sym->file, __ATOMIC_ACQUIRE) == this) { - sym->origin = 0; - sym->value = -1; - sym->sym_idx = -1; - sym->ver_idx = VER_NDX_UNSPECIFIED; - sym->is_weak = false; - sym->is_imported = false; - sym->is_exported = false; - __atomic_store_n(&sym->file, nullptr, __ATOMIC_RELEASE); - } - } -} - // Find the source filename. It should be listed in symtab as STT_FILE. 
template std::string_view InputFile::get_source_name() const { @@ -82,7 +117,7 @@ std::string_view InputFile::get_source_name() const { } template -ObjectFile::ObjectFile(Context &ctx, MappedFile> *mf, +ObjectFile::ObjectFile(Context &ctx, MappedFile *mf, std::string archive_name, bool is_in_lib) : InputFile(ctx, mf), archive_name(archive_name), is_in_lib(is_in_lib) { this->is_alive = !is_in_lib; @@ -90,7 +125,7 @@ ObjectFile::ObjectFile(Context &ctx, MappedFile> *mf, template ObjectFile * -ObjectFile::create(Context &ctx, MappedFile> *mf, +ObjectFile::create(Context &ctx, MappedFile *mf, std::string archive_name, bool is_in_lib) { ObjectFile *obj = new ObjectFile(ctx, mf, archive_name, is_in_lib); ctx.obj_pool.emplace_back(obj); @@ -191,11 +226,44 @@ static void read_riscv_attributes(Context &ctx, ObjectFile &file, } } +template +static bool is_known_section_type(const ElfShdr &shdr) { + u32 ty = shdr.sh_type; + u32 flags = shdr.sh_flags; + + if (ty == SHT_PROGBITS || + ty == SHT_NOTE || + ty == SHT_NOBITS || + ty == SHT_INIT_ARRAY || + ty == SHT_FINI_ARRAY || + ty == SHT_PREINIT_ARRAY) + return true; + + if (SHT_LOUSER <= ty && ty <= SHT_HIUSER && !(flags & SHF_ALLOC)) + return true; + if (SHT_LOOS <= ty && ty <= SHT_HIOS && !(flags & SHF_OS_NONCONFORMING)) + return true; + if (is_x86_64 && ty == SHT_X86_64_UNWIND) + return true; + if (is_arm32 && (ty == SHT_ARM_EXIDX || ty == SHT_ARM_ATTRIBUTES)) + return true; + if (is_riscv && ty == SHT_RISCV_ATTRIBUTES) + return true; + return false; +} + template void ObjectFile::initialize_sections(Context &ctx) { // Read sections for (i64 i = 0; i < this->elf_sections.size(); i++) { const ElfShdr &shdr = this->elf_sections[i]; + std::string_view name = this->shstrtab.data() + shdr.sh_name; + + if ((shdr.sh_flags & SHF_EXCLUDE) && + name.starts_with(".gnu.offload_lto_.symtab.")) { + this->is_gcc_offload_obj = true; + continue; + } if ((shdr.sh_flags & SHF_EXCLUDE) && !(shdr.sh_flags & SHF_ALLOC) && shdr.sh_type != 
SHT_LLVM_ADDRSIG && !ctx.arg.relocatable) @@ -245,7 +313,7 @@ void ObjectFile::initialize_sections(Context &ctx) { typename decltype(ctx.comdat_groups)::const_accessor acc; ctx.comdat_groups.insert(acc, {signature, ComdatGroup()}); ComdatGroup *group = const_cast(&acc->second); - comdat_groups.push_back({group, (u32)i, entries.subspan(1)}); + comdat_groups.push_back({group, (i32)i, entries.subspan(1)}); break; } case SHT_REL: @@ -255,8 +323,10 @@ void ObjectFile::initialize_sections(Context &ctx) { case SHT_STRTAB: case SHT_NULL: break; - default: { - std::string_view name = this->shstrtab.data() + shdr.sh_name; + default: + if (!is_known_section_type(shdr)) + Fatal(ctx) << *this << ": " << name << ": unsupported section type: 0x" + << std::hex << (u32)shdr.sh_type; // .note.GNU-stack section controls executable-ness of the stack // area in GNU linkers. We ignore that section because silently @@ -299,6 +369,10 @@ void ObjectFile::initialize_sections(Context &ctx) { is_debug_section(shdr, name)) continue; + if (name == ".comment" && + this->get_string(ctx, shdr).starts_with("rustc ")) + this->is_rust_obj = true; + // If an output file doesn't have a section header (i.e. // --oformat=binary is given), we discard all non-memory-allocated // sections. This is because without a section header, we can't find @@ -310,21 +384,25 @@ void ObjectFile::initialize_sections(Context &ctx) { // Save .llvm_addrsig for --icf=safe. if (shdr.sh_type == SHT_LLVM_ADDRSIG && !ctx.arg.relocatable) { - llvm_addrsig = std::move(this->sections[i]); + // sh_link should be the index of the symbol table section. + // Tools that mutates the symbol table, such as objcopy or `ld -r` + // tend to not preserve sh_link, so we ignore such section. 
+ if (shdr.sh_link != 0) + llvm_addrsig = std::move(this->sections[i]); continue; } if (shdr.sh_type == SHT_INIT_ARRAY || shdr.sh_type == SHT_FINI_ARRAY || shdr.sh_type == SHT_PREINIT_ARRAY) - ctx.has_init_array = true; + this->has_init_array = true; if (name == ".ctors" || name.starts_with(".ctors.") || name == ".dtors" || name.starts_with(".dtors.")) - ctx.has_ctors = true; + this->has_ctors = true; if (name == ".eh_frame") - eh_frame_section = this->sections[i].get(); + eh_frame_sections.push_back(this->sections[i].get()); if constexpr (is_ppc32) if (name == ".got2") @@ -365,7 +443,6 @@ void ObjectFile::initialize_sections(Context &ctx) { counter++; break; } - } } // Attach relocation sections to their target sections. @@ -416,111 +493,92 @@ void ObjectFile::initialize_sections(Context &ctx) { // This function parses an input .eh_frame section. template void ObjectFile::parse_ehframe(Context &ctx) { - if (!eh_frame_section) - return; - - InputSection &isec = *eh_frame_section; - std::span> rels = isec.get_rels(ctx); - i64 cies_begin = cies.size(); - i64 fdes_begin = fdes.size(); + for (InputSection *isec : eh_frame_sections) { + std::span> rels = isec->get_rels(ctx); + i64 cies_begin = cies.size(); + i64 fdes_begin = fdes.size(); + + // Read CIEs and FDEs until empty. + std::string_view contents = this->get_string(ctx, isec->shdr()); + i64 rel_idx = 0; + + for (std::string_view data = contents; !data.empty();) { + i64 size = *(U32 *)data.data(); + if (size == 0) + break; - // Read CIEs and FDEs until empty. 
- std::string_view contents = this->get_string(ctx, isec.shdr()); - i64 rel_idx = 0; + i64 begin_offset = data.data() - contents.data(); + i64 end_offset = begin_offset + size + 4; + i64 id = *(U32 *)(data.data() + 4); + data = data.substr(size + 4); - for (std::string_view data = contents; !data.empty();) { - i64 size = *(U32 *)data.data(); - if (size == 0) - break; + i64 rel_begin = rel_idx; + while (rel_idx < rels.size() && rels[rel_idx].r_offset < end_offset) + rel_idx++; + assert(rel_idx == rels.size() || begin_offset <= rels[rel_begin].r_offset); - i64 begin_offset = data.data() - contents.data(); - i64 end_offset = begin_offset + size + 4; - i64 id = *(U32 *)(data.data() + 4); - data = data.substr(size + 4); + if (id == 0) { + // This is CIE. + cies.emplace_back(ctx, *this, *isec, begin_offset, rels, rel_begin); + } else { + // This is FDE. + if (rel_begin == rel_idx || rels[rel_begin].r_sym == 0) { + // FDE has no valid relocation, which means FDE is dead from + // the beginning. Compilers usually don't create such FDE, but + // `ld -r` tend to generate such dead FDEs. + continue; + } - i64 rel_begin = rel_idx; - while (rel_idx < rels.size() && rels[rel_idx].r_offset < end_offset) - rel_idx++; - assert(rel_idx == rels.size() || begin_offset <= rels[rel_begin].r_offset); + if (rels[rel_begin].r_offset - begin_offset != 8) + Fatal(ctx) << *isec << ": FDE's first relocation should have offset 8"; - if (id == 0) { - // This is CIE. - cies.emplace_back(ctx, *this, isec, begin_offset, rels, rel_begin); - } else { - // This is FDE. - if (rel_begin == rel_idx || rels[rel_begin].r_sym == 0) { - // FDE has no valid relocation, which means FDE is dead from - // the beginning. Compilers usually don't create such FDE, but - // `ld -r` tend to generate such dead FDEs. 
- continue; + fdes.emplace_back(begin_offset, rel_begin); } - - if (rels[rel_begin].r_offset - begin_offset != 8) - Fatal(ctx) << isec << ": FDE's first relocation should have offset 8"; - - fdes.emplace_back(begin_offset, rel_begin); } - } - // Associate CIEs to FDEs. - auto find_cie = [&](i64 offset) { - for (i64 i = cies_begin; i < cies.size(); i++) - if (cies[i].input_offset == offset) - return i; - Fatal(ctx) << isec << ": bad FDE pointer"; - }; + // Associate CIEs to FDEs. + auto find_cie = [&](i64 offset) { + for (i64 i = cies_begin; i < cies.size(); i++) + if (cies[i].input_offset == offset) + return i; + Fatal(ctx) << *isec << ": bad FDE pointer"; + }; - for (i64 i = fdes_begin; i < fdes.size(); i++) { - i64 cie_offset = *(I32 *)(contents.data() + fdes[i].input_offset + 4); - fdes[i].cie_idx = find_cie(fdes[i].input_offset + 4 - cie_offset); + for (i64 i = fdes_begin; i < fdes.size(); i++) { + i64 cie_offset = *(I32 *)(contents.data() + fdes[i].input_offset + 4); + fdes[i].cie_idx = find_cie(fdes[i].input_offset + 4 - cie_offset); + } } - auto get_isec = [&](const FdeRecord &fde) -> InputSection * { - return get_section(this->elf_syms[rels[fde.rel_idx].r_sym]); + auto get_isec = [&](const FdeRecord &fde) { + return get_section(this->elf_syms[fde.get_rels(*this)[0].r_sym]); }; // We assume that FDEs for the same input sections are contiguous // in `fdes` vector. - std::stable_sort(fdes.begin() + fdes_begin, fdes.end(), - [&](const FdeRecord &a, const FdeRecord &b) { + sort(fdes, [&](const FdeRecord &a, const FdeRecord &b) { return get_isec(a)->get_priority() < get_isec(b)->get_priority(); }); // Associate FDEs to input sections. - for (i64 i = fdes_begin; i < fdes.size();) { + for (i64 i = 0; i < fdes.size();) { InputSection *isec = get_isec(fdes[i]); assert(isec->fde_begin == -1); - isec->fde_begin = i++; - while (i < fdes.size() && isec == get_isec(fdes[i])) - i++; - isec->fde_end = i; - } -} - -// Returns a symbol object for a given key. 
This function handles -// the -wrap option. -template -static Symbol *insert_symbol(Context &ctx, const ElfSym &esym, - std::string_view key, std::string_view name) { - if (esym.is_undef() && name.starts_with("__real_") && - ctx.arg.wrap.contains(name.substr(7))) { - return get_symbol(ctx, key.substr(7), name.substr(7)); - } - - Symbol *sym = get_symbol(ctx, key, name); - - if (esym.is_undef() && sym->is_wrapped) { - key = save_string(ctx, "__wrap_" + std::string(key)); - name = save_string(ctx, "__wrap_" + std::string(name)); - return get_symbol(ctx, key, name); + if (isec->is_alive) { + isec->fde_begin = i++; + while (i < fdes.size() && isec == get_isec(fdes[i])) + i++; + isec->fde_end = i; + } else { + fdes[i++].is_alive = false; + } } - return sym; } template void ObjectFile::initialize_symbols(Context &ctx) { - if (!symtab_sec) + if (this->elf_syms.empty()) return; static Counter counter("all_syms"); @@ -564,6 +622,9 @@ void ObjectFile::initialize_symbols(Context &ctx) { for (i64 i = this->first_global; i < this->elf_syms.size(); i++) { const ElfSym &esym = this->elf_syms[i]; + if (esym.is_common()) + has_common_symbol = true; + // Get a symbol name std::string_view key = this->symbol_strtab.data() + esym.st_name; std::string_view name = key; @@ -580,9 +641,21 @@ void ObjectFile::initialize_symbols(Context &ctx) { } } - this->symbols[i] = insert_symbol(ctx, esym, key, name); - if (esym.is_common()) - has_common_symbol = true; + // Handle --wrap option + Symbol *sym; + if (esym.is_undef() && name.starts_with("__real_") && + ctx.arg.wrap.contains(name.substr(7))) { + sym = get_symbol(ctx, key.substr(7), name.substr(7)); + } else { + sym = get_symbol(ctx, key, name); + if (esym.is_undef() && sym->is_wrapped) { + key = save_string(ctx, "__wrap_" + std::string(key)); + name = save_string(ctx, "__wrap_" + std::string(name)); + sym = get_symbol(ctx, key, name); + } + } + + this->symbols[i] = sym; } } @@ -608,109 +681,27 @@ void ObjectFile::sort_relocations(Context 
&ctx) { } } -static size_t find_null(std::string_view data, u64 entsize) { - if (entsize == 1) - return data.find('\0'); - - for (i64 i = 0; i <= data.size() - entsize; i += entsize) - if (data.substr(i, entsize).find_first_not_of('\0') == data.npos) - return i; - - return data.npos; -} - -// Mergeable sections (sections with SHF_MERGE bit) typically contain -// string literals. Linker is expected to split the section contents -// into null-terminated strings, merge them with mergeable strings -// from other object files, and emit uniquified strings to an output -// file. -// -// This mechanism reduces the size of an output file. If two source -// files happen to contain the same string literal, the output will -// contain only a single copy of it. -// -// It is less common than string literals, but mergeable sections can -// contain fixed-sized read-only records too. -// -// This function splits the section contents into small pieces that we -// call "section fragments". Section fragment is a unit of merging. -// -// We do not support mergeable sections that have relocations. template -static std::unique_ptr> -split_section(Context &ctx, InputSection &sec) { - if (!sec.is_alive || sec.relsec_idx != -1) - return nullptr; - - const ElfShdr &shdr = sec.shdr(); - if (!(shdr.sh_flags & SHF_MERGE)) - return nullptr; - - i64 entsize = shdr.sh_entsize; - if (entsize == 0) - entsize = (shdr.sh_flags & SHF_STRINGS) ? 1 : (int)shdr.sh_addralign; - - if (entsize == 0) - return nullptr; - - i64 addralign = shdr.sh_addralign; - if (addralign == 0) - addralign = 1; - - std::unique_ptr> rec(new MergeableSection); - rec->parent = MergedSection::get_instance(ctx, sec.name(), shdr.sh_type, - shdr.sh_flags, entsize, addralign); - rec->p2align = sec.p2align; - - if (sec.sh_size == 0) - return rec; - - // If thes section contents are compressed, uncompress them. 
- sec.uncompress(ctx); - - std::string_view data = sec.contents; - const char *begin = data.data(); - HyperLogLog estimator; - - // Split sections - if (shdr.sh_flags & SHF_STRINGS) { - while (!data.empty()) { - size_t end = find_null(data, entsize); - if (end == data.npos) - Fatal(ctx) << sec << ": string is not null terminated"; - - std::string_view substr = data.substr(0, end + entsize); - data = data.substr(end + entsize); - - rec->strings.push_back(substr); - rec->frag_offsets.push_back(substr.data() - begin); - - u64 hash = hash_string(substr); - rec->hashes.push_back(hash); - estimator.insert(hash); - } - } else { - if (data.size() % entsize) - Fatal(ctx) << sec << ": section size is not multiple of sh_entsize"; +void ObjectFile::convert_mergeable_sections(Context &ctx) { + // Convert InputSections to MergeableSections + for (i64 i = 0; i < this->sections.size(); i++) { + InputSection *isec = this->sections[i].get(); + if (!isec || isec->sh_size == 0 || isec->relsec_idx != -1) + continue; - while (!data.empty()) { - std::string_view substr = data.substr(0, entsize); - data = data.substr(entsize); + const ElfShdr &shdr = isec->shdr(); + if (!(shdr.sh_flags & SHF_MERGE)) + continue; - rec->strings.push_back(substr); - rec->frag_offsets.push_back(substr.data() - begin); + MergedSection *parent = + MergedSection::get_instance(ctx, isec->name(), shdr); - u64 hash = hash_string(substr); - rec->hashes.push_back(hash); - estimator.insert(hash); + if (parent) { + this->mergeable_sections[i] = + std::make_unique>(ctx, *parent, this->sections[i]); + this->sections[i] = nullptr; } } - - rec->parent->estimator.merge(estimator); - - static Counter counter("string_fragments"); - counter += rec->fragments.size(); - return rec; } // Usually a section is an atomic unit of inclusion or exclusion. @@ -749,40 +740,17 @@ split_section(Context &ctx, InputSection &sec) { // section piece in a section, but it doesn't do for any other types // of symbols. 
// -// In mold, we attach symbols to section pieces. If a relocation refers -// to a section symbol, and that symbol's section is a mergeable one, -// we create a new dummy symbol for a section piece and redirect the -// relocation to this new symbol. If a non-section symbol refers to a -// section piece, the section piece is attached to the symbol. -template -void ObjectFile::initialize_mergeable_sections(Context &ctx) { - mergeable_sections.resize(sections.size()); - - for (i64 i = 0; i < sections.size(); i++) { - if (std::unique_ptr> &isec = sections[i]) { - if (std::unique_ptr> m = split_section(ctx, *isec)) { - mergeable_sections[i] = std::move(m); - isec->is_alive = false; - } - } - } -} - +// Section garbage collection and Identical Code Folding work on graphs +// where sections or section pieces are vertices and relocations are +// edges. To make it easy to handle them, we rewrite symbols and +// relocations so that each non-absolute symbol always refers to either +// a non-mergeable section or a section piece. +// +// We do that only for SHF_ALLOC sections because GC and ICF work only +// on memory-allocated sections. Non-memory-allocated mergeable sections +// are not handled here for performance reasons. template -void ObjectFile::resolve_section_pieces(Context &ctx) { - for (std::unique_ptr> &m : mergeable_sections) { - if (m) { - m->fragments.reserve(m->strings.size()); - for (i64 i = 0; i < m->strings.size(); i++) - m->fragments.push_back(m->parent->insert(ctx, m->strings[i], m->hashes[i], - m->p2align)); - - // Shrink vectors that we will never use again to reclaim memory. - m->strings.clear(); - m->hashes.clear(); - } - } - +void ObjectFile::reattach_section_pieces(Context &ctx) { // Attach section pieces to symbols. 
for (i64 i = 1; i < this->elf_syms.size(); i++) { Symbol &sym = *this->symbols[i]; @@ -791,8 +759,9 @@ void ObjectFile::resolve_section_pieces(Context &ctx) { if (esym.is_abs() || esym.is_common() || esym.is_undef()) continue; - std::unique_ptr> &m = mergeable_sections[get_shndx(esym)]; - if (!m || m->fragments.empty()) + i64 shndx = get_shndx(esym); + std::unique_ptr> &m = mergeable_sections[shndx]; + if (!m || !m->parent.resolved) continue; SectionFragment *frag; @@ -809,49 +778,51 @@ void ObjectFile::resolve_section_pieces(Context &ctx) { // Compute the size of frag_syms. i64 nfrag_syms = 0; for (std::unique_ptr> &isec : sections) - if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC)) + if (isec && (isec->shdr().sh_flags & SHF_ALLOC)) for (ElfRel &r : isec->get_rels(ctx)) if (const ElfSym &esym = this->elf_syms[r.r_sym]; - esym.st_type == STT_SECTION && mergeable_sections[get_shndx(esym)]) - nfrag_syms++; + esym.st_type == STT_SECTION) + if (mergeable_sections[get_shndx(esym)]) + nfrag_syms++; this->frag_syms.resize(nfrag_syms); - // For each relocation referring a mergeable section symbol, we create - // a new dummy non-section symbol and redirect the relocation to the - // newly-created symbol. + // For each relocation referring to a mergeable section symbol, we + // create a new dummy non-section symbol and redirect the relocation + // to the newly created symbol. 
i64 idx = 0; for (std::unique_ptr> &isec : sections) { - if (!isec || !isec->is_alive || !(isec->shdr().sh_flags & SHF_ALLOC)) - continue; - - for (ElfRel &r : isec->get_rels(ctx)) { - const ElfSym &esym = this->elf_syms[r.r_sym]; - if (esym.st_type != STT_SECTION) - continue; - - std::unique_ptr> &m = mergeable_sections[get_shndx(esym)]; - if (!m) - continue; - - i64 r_addend = get_addend(*isec, r); - - SectionFragment *frag; - i64 in_frag_offset; - std::tie(frag, in_frag_offset) = m->get_fragment(esym.st_value + r_addend); - - if (!frag) - Fatal(ctx) << *this << ": bad relocation at " << r.r_sym; - - Symbol &sym = this->frag_syms[idx]; - sym.file = this; - sym.set_name(""); - sym.sym_idx = r.r_sym; - sym.visibility = STV_HIDDEN; - sym.set_frag(frag); - sym.value = in_frag_offset - r_addend; - r.r_sym = this->elf_syms.size() + idx; - idx++; + if (isec && (isec->shdr().sh_flags & SHF_ALLOC)) { + for (ElfRel &r : isec->get_rels(ctx)) { + const ElfSym &esym = this->elf_syms[r.r_sym]; + if (esym.st_type != STT_SECTION) + continue; + + i64 shndx = get_shndx(esym); + std::unique_ptr> &m = mergeable_sections[shndx]; + if (!m) + continue; + + assert(m->parent.resolved); + + i64 r_addend = get_addend(*isec, r); + SectionFragment *frag; + i64 in_frag_offset; + std::tie(frag, in_frag_offset) = m->get_fragment(esym.st_value + r_addend); + + if (!frag) + Fatal(ctx) << *this << ": bad relocation at " << r.r_sym; + + Symbol &sym = this->frag_syms[idx]; + sym.file = this; + sym.set_name(""); + sym.sym_idx = r.r_sym; + sym.visibility = STV_HIDDEN; + sym.set_frag(frag); + sym.value = in_frag_offset - r_addend; + r.r_sym = this->elf_syms.size() + idx; + idx++; + } } } @@ -864,6 +835,8 @@ void ObjectFile::resolve_section_pieces(Context &ctx) { template void ObjectFile::parse(Context &ctx) { sections.resize(this->elf_sections.size()); + mergeable_sections.resize(sections.size()); + symtab_sec = this->find_section(SHT_SYMTAB); if (symtab_sec) { @@ -880,7 +853,6 @@ void 
ObjectFile::parse(Context &ctx) { initialize_sections(ctx); initialize_symbols(ctx); sort_relocations(ctx); - parse_ehframe(ctx); } // Symbols with higher priorities overwrites symbols with lower priorities. @@ -954,11 +926,11 @@ template static void print_trace_symbol(Context &ctx, InputFile &file, const ElfSym &esym, Symbol &sym) { if (!esym.is_undef()) - SyncOut(ctx) << "trace-symbol: " << file << ": definition of " << sym; + Out(ctx) << "trace-symbol: " << file << ": definition of " << sym; else if (esym.is_weak()) - SyncOut(ctx) << "trace-symbol: " << file << ": weak reference to " << sym; + Out(ctx) << "trace-symbol: " << file << ": weak reference to " << sym; else - SyncOut(ctx) << "trace-symbol: " << file << ": reference to " << sym; + Out(ctx) << "trace-symbol: " << file << ": reference to " << sym; } template @@ -1007,19 +979,16 @@ ObjectFile::mark_live_objects(Context &ctx, if (sym.is_traced) print_trace_symbol(ctx, *this, esym, sym); - if (esym.is_weak()) - continue; - - if (!sym.file) - continue; + if (sym.file) { + bool undef_ref = esym.is_undef() && (!esym.is_weak() || sym.file->is_dso); + bool common_ref = esym.is_common() && !sym.esym().is_common(); - bool keep = esym.is_undef() || (esym.is_common() && !sym.esym().is_common()); - if (keep && !sym.file->is_alive.test_and_set()) { - feeder(sym.file); - - if (sym.is_traced) - SyncOut(ctx) << "trace-symbol: " << *this << " keeps " << *sym.file - << " for " << sym; + if ((undef_ref || common_ref) && !sym.file->is_alive.test_and_set()) { + feeder(sym.file); + if (sym.is_traced) + Out(ctx) << "trace-symbol: " << *this << " keeps " << *sym.file + << " for " << sym; + } } } } @@ -1080,39 +1049,31 @@ void ObjectFile::convert_common_symbols(Context &ctx) { continue; Symbol &sym = *this->symbols[i]; - std::scoped_lock lock(sym.mu); - if (sym.file != this) { if (ctx.arg.warn_common) Warn(ctx) << *this << ": multiple common symbols: " << sym; continue; } - elf_sections2.push_back({}); - ElfShdr &shdr = 
elf_sections2.back(); - memset(&shdr, 0, sizeof(shdr)); - - if (sym.get_type() == STT_TLS) { + ElfShdr shdr = {}; + if (sym.get_type() == STT_TLS) shdr.sh_flags = SHF_ALLOC | SHF_WRITE | SHF_TLS; - } else { + else shdr.sh_flags = SHF_ALLOC | SHF_WRITE; - } shdr.sh_type = SHT_NOBITS; shdr.sh_size = this->elf_syms[i].st_size; shdr.sh_addralign = this->elf_syms[i].st_value; + elf_sections2.push_back(shdr); i64 idx = this->elf_sections.size() + elf_sections2.size() - 1; - std::unique_ptr> isec = - std::make_unique>(ctx, *this, idx); + auto isec = std::make_unique>(ctx, *this, idx); - sym.file = this; sym.set_input_section(isec.get()); sym.value = 0; sym.sym_idx = i; sym.ver_idx = ctx.default_version; sym.is_weak = false; - sections.push_back(std::move(isec)); } } @@ -1142,9 +1103,6 @@ static bool should_write_to_local_symtab(Context &ctx, Symbol &sym) { template void ObjectFile::compute_symtab_size(Context &ctx) { - if (ctx.arg.strip_all) - return; - this->output_sym_indices.resize(this->elf_syms.size(), -1); auto is_alive = [&](Symbol &sym) -> bool { @@ -1236,19 +1194,12 @@ std::ostream &operator<<(std::ostream &out, const InputFile &file) { } template -SharedFile * -SharedFile::create(Context &ctx, MappedFile> *mf) { +SharedFile *SharedFile::create(Context &ctx, MappedFile *mf) { SharedFile *obj = new SharedFile(ctx, mf); ctx.dso_pool.emplace_back(obj); return obj; } -template -SharedFile::SharedFile(Context &ctx, MappedFile> *mf) - : InputFile(ctx, mf) { - this->is_alive = !ctx.as_needed; -} - template std::string SharedFile::get_soname(Context &ctx) { if (ElfShdr *sec = this->find_section(SHT_DYNAMIC)) @@ -1311,6 +1262,32 @@ void SharedFile::parse(Context &ctx) { counter += this->elf_syms.size(); } +template +std::vector SharedFile::get_dt_needed(Context &ctx) { + // Get the contents of the dynamic segment + std::span> dynamic; + for (ElfPhdr &phdr : this->get_phdrs()) + if (phdr.p_type == PT_DYNAMIC) + dynamic = {(Word *)(this->mf->data + phdr.p_offset), + 
(size_t)phdr.p_memsz / sizeof(Word)}; + + // Find a string table + char *strtab = nullptr; + for (i64 i = 0; i < dynamic.size(); i += 2) + if (dynamic[i] == DT_STRTAB) + strtab = (char *)this->mf->data + dynamic[i + 1]; + + if (!strtab) + return {}; + + // Find all DT_NEEDED entries + std::vector vec; + for (i64 i = 0; i < dynamic.size(); i += 2) + if (dynamic[i] == DT_NEEDED) + vec.push_back(strtab + dynamic[i + 1]); + return vec; +} + // Symbol versioning is a GNU extension to the ELF file format. I don't // particularly like the feature as it complicates the semantics of // dynamic linking, but we need to support it anyway because it is @@ -1343,32 +1320,31 @@ void SharedFile::parse(Context &ctx) { // default version of the library) at load-time. template std::vector SharedFile::read_verdef(Context &ctx) { - std::vector ret(VER_NDX_LAST_RESERVED + 1); - ElfShdr *verdef_sec = this->find_section(SHT_GNU_VERDEF); if (!verdef_sec) - return ret; + return {}; std::string_view verdef = this->get_string(ctx, *verdef_sec); std::string_view strtab = this->get_string(ctx, verdef_sec->sh_link); - ElfVerdef *ver = (ElfVerdef *)verdef.data(); + std::vector vec; + u8 *ptr = (u8 *)verdef.data(); for (;;) { + ElfVerdef *ver = (ElfVerdef *)ptr; if (ver->vd_ndx == VER_NDX_UNSPECIFIED) Fatal(ctx) << *this << ": symbol version too large"; - if (ret.size() <= ver->vd_ndx) - ret.resize(ver->vd_ndx + 1); + if (vec.size() <= ver->vd_ndx) + vec.resize(ver->vd_ndx + 1); - ElfVerdaux *aux = (ElfVerdaux *)((u8 *)ver + ver->vd_aux); - ret[ver->vd_ndx] = strtab.data() + aux->vda_name; + ElfVerdaux *aux = (ElfVerdaux *)(ptr + ver->vd_aux); + vec[ver->vd_ndx] = strtab.data() + aux->vda_name; if (!ver->vd_next) break; - - ver = (ElfVerdef *)((u8 *)ver + ver->vd_next); + ptr += ver->vd_next; } - return ret; + return vec; } template @@ -1376,7 +1352,8 @@ void SharedFile::resolve_symbols(Context &ctx) { for (i64 i = 0; i < this->symbols.size(); i++) { Symbol &sym = *this->symbols[i]; const ElfSym 
&esym = this->elf_syms[i]; - if (esym.is_undef()) + + if (esym.is_undef() || sym.skip_dso) continue; std::scoped_lock lock(sym.mu); @@ -1387,7 +1364,7 @@ void SharedFile::resolve_symbols(Context &ctx) { sym.value = esym.st_value; sym.sym_idx = i; sym.ver_idx = versyms[i]; - sym.is_weak = false; + sym.is_weak = true; } } } @@ -1403,27 +1380,27 @@ SharedFile::mark_live_objects(Context &ctx, if (sym.is_traced) print_trace_symbol(ctx, *this, esym, sym); - if (esym.is_undef() && !esym.is_weak() && sym.file && !sym.file->is_dso && + if (esym.is_undef() && !esym.is_weak() && sym.file && !sym.file->is_alive.test_and_set()) { feeder(sym.file); if (sym.is_traced) - SyncOut(ctx) << "trace-symbol: " << *this << " keeps " << *sym.file - << " for " << sym; + Out(ctx) << "trace-symbol: " << *this << " keeps " << *sym.file + << " for " << sym; } } } template -std::span *> SharedFile::find_aliases(Symbol *sym) { +std::span *> SharedFile::get_symbols_at(Symbol *sym) { assert(sym->file == this); - std::call_once(init_aliases, [&] { + std::call_once(init_sorted_syms, [&] { for (Symbol *sym : this->symbols) if (sym->file == this) - aliases.push_back(sym); + sorted_syms.push_back(sym); - tbb::parallel_sort(aliases.begin(), aliases.end(), + tbb::parallel_sort(sorted_syms.begin(), sorted_syms.end(), [](Symbol *a, Symbol *b) { const ElfSym &x = a->esym(); const ElfSym &y = b->esym(); @@ -1431,12 +1408,12 @@ std::span *> SharedFile::find_aliases(Symbol *sym) { }); }); - auto [begin, end] = std::equal_range(aliases.begin(), aliases.end(), sym, - [&](Symbol *x, Symbol *y) { - return x->esym().st_value < y->esym().st_value; + auto [begin, end] = std::equal_range(sorted_syms.begin(), sorted_syms.end(), + sym, [&](Symbol *a, Symbol *b) { + return a->esym().st_value < b->esym().st_value; }); - return {&*begin, &*end}; + return {&*begin, (size_t)(end - begin)}; } // Infer an alignment of a DSO symbol. 
An alignment of a symbol in other @@ -1469,9 +1446,6 @@ bool SharedFile::is_readonly(Symbol *sym) { template void SharedFile::compute_symtab_size(Context &ctx) { - if (ctx.arg.strip_all) - return; - this->output_sym_indices.resize(this->elf_syms.size(), -1); // Compute the size of global symbols. @@ -1495,9 +1469,8 @@ void SharedFile::populate_symtab(Context &ctx) { u8 *strtab = ctx.buf + ctx.strtab->shdr.sh_offset; i64 strtab_off = this->strtab_offset; - for (i64 i = 0, j = this->first_global; j < this->elf_syms.size(); i++, j++) { - Symbol &sym = *this->symbols[j]; - if (sym.file != this || !sym.write_to_symtab) + for (i64 i = 0; Symbol *sym : this->get_global_syms()) { + if (sym->file != this || !sym->write_to_symtab) continue; U32 *xindex = nullptr; @@ -1505,8 +1478,9 @@ void SharedFile::populate_symtab(Context &ctx) { xindex = (U32 *)(ctx.buf + ctx.symtab_shndx->shdr.sh_offset) + this->global_symtab_idx + i; - *symtab++ = to_output_esym(ctx, sym, strtab_off, xindex); - strtab_off += write_string(strtab + strtab_off, sym.name()); + *symtab++ = to_output_esym(ctx, *sym, strtab_off, xindex); + strtab_off += write_string(strtab + strtab_off, sym->name()); + i++; } } @@ -1515,6 +1489,10 @@ using E = MOLD_TARGET; template class InputFile; template class ObjectFile; template class SharedFile; +template Symbol *get_symbol(Context &, std::string_view, std::string_view); +template Symbol *get_symbol(Context &, std::string_view); +template std::string_view demangle(const Symbol &); +template std::ostream &operator<<(std::ostream &, const Symbol &); template std::ostream &operator<<(std::ostream &, const InputFile &); -} // namespace mold::elf +} // namespace mold diff --git a/src/input-sections.cc b/src/input-sections.cc new file mode 100644 index 00000000..399c80a9 --- /dev/null +++ b/src/input-sections.cc @@ -0,0 +1,440 @@ +#include "mold.h" + +#include +#include +#include + +namespace mold { + +static i64 to_p2align(u64 alignment) { + if (alignment == 0) + return 0; + 
return std::countr_zero(alignment); +} + +template +bool cie_equals(const CieRecord &a, const CieRecord &b) { + if (a.get_contents() != b.get_contents()) + return false; + + std::span> x = a.get_rels(); + std::span> y = b.get_rels(); + if (x.size() != y.size()) + return false; + + for (i64 i = 0; i < x.size(); i++) + if (x[i].r_offset - a.input_offset != y[i].r_offset - b.input_offset || + x[i].r_type != y[i].r_type || + a.file.symbols[x[i].r_sym] != b.file.symbols[y[i].r_sym] || + get_addend(a.input_section, x[i]) != get_addend(b.input_section, y[i])) + return false; + return true; +} + +template +InputSection::InputSection(Context &ctx, ObjectFile &file, i64 shndx) + : file(file), shndx(shndx) { + if (shndx < file.elf_sections.size()) + contents = {(char *)file.mf->data + shdr().sh_offset, (size_t)shdr().sh_size}; + + if (shdr().sh_flags & SHF_COMPRESSED) { + ElfChdr &chdr = *(ElfChdr *)&contents[0]; + sh_size = chdr.ch_size; + p2align = to_p2align(chdr.ch_addralign); + } else { + sh_size = shdr().sh_size; + p2align = to_p2align(shdr().sh_addralign); + } + + // Sections may have been compressed. We usually uncompress them + // directly into the mmap'ed output file, but we want to uncompress + // early for REL-type ELF types to read relocation addends from + // section contents. For RELA-type, we don't need to do this because + // addends are in relocations. + // + // SH-4 stores addends to sections despite being RELA, which is a + // special (and buggy) case. 
+ if constexpr (!E::is_rela || is_sh4) + uncompress(ctx); +} + +template +void InputSection::uncompress(Context &ctx) { + if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed) + return; + + u8 *buf = new u8[sh_size]; + copy_contents(ctx, buf); + contents = std::string_view((char *)buf, sh_size); + ctx.string_pool.emplace_back(buf); + uncompressed = true; +} + +template +void InputSection::copy_contents(Context &ctx, u8 *buf) { + if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed) { + memcpy(buf, contents.data(), contents.size()); + return; + } + + if (contents.size() < sizeof(ElfChdr)) + Fatal(ctx) << *this << ": corrupted compressed section"; + + ElfChdr &hdr = *(ElfChdr *)&contents[0]; + std::string_view data = contents.substr(sizeof(ElfChdr)); + + switch (hdr.ch_type) { + case ELFCOMPRESS_ZLIB: { + unsigned long size = sh_size; + if (::uncompress(buf, &size, (u8 *)data.data(), data.size()) != Z_OK) + Fatal(ctx) << *this << ": uncompress failed"; + assert(size == sh_size); + break; + } + case ELFCOMPRESS_ZSTD: + if (ZSTD_decompress(buf, sh_size, (u8 *)data.data(), data.size()) != sh_size) + Fatal(ctx) << *this << ": ZSTD_decompress failed"; + break; + default: + Fatal(ctx) << *this << ": unsupported compression type: 0x" + << std::hex << hdr.ch_type; + } +} + +typedef enum : u8 { NONE, ERROR, COPYREL, PLT, CPLT } Action; + +template +static void do_action(Context &ctx, Action action, InputSection &isec, + Symbol &sym, const ElfRel &rel) { + switch (action) { + case NONE: + break; + case ERROR: + Error(ctx) << isec << ": " << rel << " relocation at offset 0x" + << std::hex << rel.r_offset << " against symbol `" + << sym << "' can not be used; recompile with -fPIC"; + break; + case COPYREL: + // Create a copy relocation + sym.flags |= NEEDS_COPYREL; + break; + case PLT: + // Create a PLT entry + sym.flags |= NEEDS_PLT; + break; + case CPLT: + // Create a canonical PLT entry + sym.flags |= NEEDS_CPLT; + break; + } +} + +template +static inline i64 
get_output_type(Context &ctx) { + if (ctx.arg.shared) + return 0; + if (ctx.arg.pie) + return 1; + return 2; +} + +template +static inline i64 get_sym_type(Symbol &sym) { + if (sym.is_absolute()) + return 0; + if (!sym.is_imported) + return 1; + if (sym.get_type() != STT_FUNC) + return 2; + return 3; +} + +template +void InputSection::scan_pcrel(Context &ctx, Symbol &sym, + const ElfRel &rel) { + // This is for PC-relative relocations (e.g. R_X86_64_PC32). + // We cannot promote them to dynamic relocations because the dynamic + // linker generally does not support PC-relative relocations. + static Action table[][4] = { + // Absolute Local Imported data Imported code + { ERROR, NONE, ERROR, PLT }, // Shared object + { ERROR, NONE, COPYREL, CPLT }, // Position-independent exec + { NONE, NONE, COPYREL, CPLT }, // Position-dependent exec + }; + + Action action = table[get_output_type(ctx)][get_sym_type(sym)]; + do_action(ctx, action, *this, sym, rel); +} + +template +void InputSection::scan_absrel(Context &ctx, Symbol &sym, + const ElfRel &rel) { + // This is a decision table for absolute relocations that is smaller + // than the pointer size (e.g. R_X86_64_32). Since the dynamic linker + // generally does not support dynamic relocations smaller than the + // pointer size, we need to report an error if a relocation cannot be + // resolved at link-time. + static Action table[][4] = { + // Absolute Local Imported data Imported code + { NONE, ERROR, ERROR, ERROR }, // Shared object + { NONE, ERROR, ERROR, ERROR }, // Position-independent exec + { NONE, NONE, COPYREL, CPLT }, // Position-dependent exec + }; + + Action action = table[get_output_type(ctx)][get_sym_type(sym)]; + do_action(ctx, action, *this, sym, rel); +} + +template +void InputSection::scan_tlsdesc(Context &ctx, Symbol &sym) { + if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { + // Relax TLSDESC to Local Exec. 
In this case, we directly materialize + // a TP-relative offset, so no dynamic relocation is needed. + // + // TLSDESC relocs must always be relaxed for statically-linked + // executables even if -no-relax is given. It is because a + // statically-linked executable doesn't contain a trampoline + // function needed for TLSDESC. + } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { + // In this condition, TP-relative offset of a thread-local variable + // is known at process startup time, so we can relax TLSDESC to the + // code that reads the TP-relative offset from GOT and add TP to it. + sym.flags |= NEEDS_GOTTP; + } else { + // If no relaxation is doable, we simply create a TLSDESC dynamic + // relocation. + sym.flags |= NEEDS_TLSDESC; + } +} + +template +void InputSection::check_tlsle(Context &ctx, Symbol &sym, + const ElfRel &rel) { + if (ctx.arg.shared) + Error(ctx) << *this << ": relocation " << rel << " against `" << sym + << "` can not be used when making a shared object;" + << " recompile with -fPIC"; +} + +template +void InputSection::write_to(Context &ctx, u8 *buf) { + if (shdr().sh_type == SHT_NOBITS || sh_size == 0) + return; + + // Copy data. In RISC-V and LoongArch object files, sections are not + // atomic unit of copying because of relaxation. That is, some + // relocations are allowed to remove bytes from the middle of a + // section and shrink the overall size of it. + if constexpr (is_riscv || is_loongarch) { + if (extra.r_deltas.empty()) { + // If a section is not relaxed, we can copy it as a one big chunk. + copy_contents(ctx, buf); + } else { + // A relaxed section is copied piece-wise. 
+ std::span> rels = get_rels(ctx); + u8 *buf2 = buf; + i64 pos = 0; + + for (i64 i = 0; i < rels.size(); i++) { + i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i]; + if (delta == 0) + continue; + assert(delta > 0); + + const ElfRel &r = rels[i]; + memcpy(buf2, contents.data() + pos, r.r_offset - pos); + buf2 += r.r_offset - pos; + pos = r.r_offset + delta; + } + memcpy(buf2, contents.data() + pos, contents.size() - pos); + } + } else { + copy_contents(ctx, buf); + } + + // Apply relocations + if (!ctx.arg.relocatable) { + if (shdr().sh_flags & SHF_ALLOC) + apply_reloc_alloc(ctx, buf); + else + apply_reloc_nonalloc(ctx, buf); + } +} + +// Get the name of a function containin a given offset. +template +std::string_view +InputSection::get_func_name(Context &ctx, i64 offset) const { + for (Symbol *sym : file.symbols) { + if (sym->file == &file) { + const ElfSym &esym = sym->esym(); + if (esym.st_shndx == shndx && esym.st_type == STT_FUNC && + esym.st_value <= offset && offset < esym.st_value + esym.st_size) { + if (ctx.arg.demangle) + return demangle(*sym); + return sym->name(); + } + } + } + return ""; +} + +// Test if the symbol a given relocation refers to has already been resolved. +// If not, record that error and returns true. +template +bool InputSection::record_undef_error(Context &ctx, const ElfRel &rel) { + // If a relocation refers to a linker-synthesized symbol for a + // section fragment, it's always been resolved. + if (file.elf_syms.size() <= rel.r_sym) + return false; + + Symbol &sym = *file.symbols[rel.r_sym]; + const ElfSym &esym = file.elf_syms[rel.r_sym]; + + // If a symbol is defined in a comdat group, and the comdat group is + // discarded, the symbol may not have an owner. It is technically an + // violation of the One Definition Rule, so it is a programmer's fault. 
+ if (!sym.file) { + Error(ctx) << *this << ": " << sym << " refers to a discarded COMDAT section" + << " probably due to an ODR violation"; + return true; + } + + auto record = [&] { + std::stringstream ss; + if (std::string_view source = file.get_source_name(); !source.empty()) + ss << ">>> referenced by " << source << "\n"; + else + ss << ">>> referenced by " << *this << "\n"; + + ss << ">>> " << file; + if (std::string_view func = get_func_name(ctx, rel.r_offset); !func.empty()) + ss << ":(" << func << ")"; + ss << '\n'; + + typename decltype(ctx.undef_errors)::accessor acc; + ctx.undef_errors.insert(acc, {&sym, {}}); + acc->second.push_back(ss.str()); + }; + + // A non-weak undefined symbol must be promoted to an imported symbol + // or resolved to an defined symbol. Otherwise, we need to report an + // error or warn on it. + // + // Every ELF file has an absolute local symbol as its first symbol. + // Referring to that symbol is always valid. + bool is_undef = esym.is_undef() && !esym.is_weak() && sym.sym_idx; + + if (is_undef && sym.esym().is_undef()) { + if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR && !sym.is_imported) { + record(); + return true; + } + if (ctx.arg.unresolved_symbols == UNRESOLVED_WARN) { + record(); + return false; + } + } + + return false; +} + +template +MergeableSection::MergeableSection(Context &ctx, MergedSection &parent, + std::unique_ptr> &isec) + : parent(parent), section(std::move(isec)), p2align(section->p2align) { + section->uncompress(ctx); + + std::scoped_lock lock(parent.mu); + parent.members.push_back(this); +} + +static size_t find_null(std::string_view data, i64 pos, i64 entsize) { + if (entsize == 1) + return data.find('\0', pos); + + for (; pos <= data.size() - entsize; pos += entsize) + if (data.substr(pos, entsize).find_first_not_of('\0') == data.npos) + return pos; + + return data.npos; +} + +// Mergeable sections (sections with SHF_MERGE bit) typically contain +// string literals. 
Linker is expected to split the section contents +// into null-terminated strings, merge them with mergeable strings +// from other object files, and emit uniquified strings to an output +// file. +// +// This mechanism reduces the size of an output file. If two source +// files happen to contain the same string literal, the output will +// contain only a single copy of it. +// +// It is less common than string literals, but mergeable sections can +// contain fixed-sized read-only records too. +// +// This function splits the section contents into small pieces that we +// call "section fragments". Section fragment is a unit of merging. +// +// We do not support mergeable sections that have relocations. +template +void MergeableSection::split_contents(Context &ctx) { + std::string_view data = section->contents; + if (data.size() > UINT32_MAX) + Fatal(ctx) << *section + << ": mergeable section too large"; + + i64 entsize = parent.shdr.sh_entsize; + + // Split sections + if (parent.shdr.sh_flags & SHF_STRINGS) { + for (i64 pos = 0; pos < data.size();) { + frag_offsets.push_back(pos); + size_t end = find_null(data, pos, entsize); + if (end == data.npos) + Fatal(ctx) << *section << ": string is not null terminated"; + pos = end + entsize; + } + } else { + if (data.size() % entsize) + Fatal(ctx) << *section << ": section size is not multiple of sh_entsize"; + frag_offsets.reserve(data.size() / entsize); + + for (i64 pos = 0; pos < data.size(); pos += entsize) + frag_offsets.push_back(pos); + } + + // Compute hashes for section pieces + HyperLogLog estimator; + hashes.reserve(frag_offsets.size()); + + for (i64 i = 0; i < frag_offsets.size(); i++) { + u64 hash = hash_string(get_contents(i)); + hashes.push_back(hash); + estimator.insert(hash); + } + + parent.estimator.merge(estimator); + + static Counter counter("string_fragments"); + counter += frag_offsets.size(); +} + +template +void MergeableSection::resolve_contents(Context &ctx) { + 
fragments.reserve(frag_offsets.size()); + for (i64 i = 0; i < frag_offsets.size(); i++) + fragments.push_back(parent.insert(ctx, get_contents(i), hashes[i], p2align)); + + // Reclaim memory as we'll never use this vector again + hashes.clear(); + hashes.shrink_to_fit(); +} + +using E = MOLD_TARGET; + +template bool cie_equals(const CieRecord &, const CieRecord &); +template class InputSection; +template class MergeableSection; + +} // namespace mold diff --git a/src/linker-script.cc b/src/linker-script.cc new file mode 100644 index 00000000..6fe5dab6 --- /dev/null +++ b/src/linker-script.cc @@ -0,0 +1,424 @@ +// On Linux, /usr/lib/x86_64-linux-gnu/libc.so is not actually +// a shared object file but an ASCII text file containing a linker +// script to include a "real" libc.so file. Therefore, we need to +// support a (very limited) subset of the linker script language. + +#include "mold.h" + +#include +#include + +namespace mold { + +static std::string_view get_line(std::string_view input, const char *pos) { + assert(input.data() <= pos); + assert(pos < input.data() + input.size()); + + i64 start = input.rfind('\n', pos - input.data()); + if (start == input.npos) + start = 0; + else + start++; + + i64 end = input.find('\n', pos - input.data()); + if (end == input.npos) + end = input.size(); + + return input.substr(start, end - start); +} + +template +void Script::error(std::string_view pos, std::string msg) { + std::string_view input = mf->get_contents(); + std::string_view line = get_line(input, pos.data()); + + i64 lineno = 1; + for (i64 i = 0; input.data() + i < line.data(); i++) + if (input[i] == '\n') + lineno++; + + std::string label = mf->name + ":" + std::to_string(lineno) + ": "; + i64 indent = strlen("mold: fatal: ") + label.size(); + i64 column = pos.data() - line.data(); + + Fatal(ctx) << label << line << "\n" + << std::string(indent + column, ' ') << "^ " << msg; +} + +template +void Script::tokenize() { + std::string_view input = mf->get_contents(); + 
+ while (!input.empty()) { + if (isspace(input[0])) { + input = input.substr(1); + continue; + } + + if (input.starts_with("/*")) { + i64 pos = input.find("*/", 2); + if (pos == std::string_view::npos) + error(input, "unclosed comment"); + input = input.substr(pos + 2); + continue; + } + + if (input[0] == '#') { + i64 pos = input.find("\n", 1); + if (pos == std::string_view::npos) + break; + input = input.substr(pos + 1); + continue; + } + + if (input[0] == '"') { + i64 pos = input.find('"', 1); + if (pos == std::string_view::npos) + error(input, "unclosed string literal"); + tokens.push_back(input.substr(0, pos + 1)); + input = input.substr(pos + 1); + continue; + } + + i64 pos = input.find_first_not_of( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789_.$/\\~=+[]*?-!^:"); + + if (pos == 0) + pos = 1; + else if (pos == input.npos) + pos = input.size(); + + tokens.push_back(input.substr(0, pos)); + input = input.substr(pos); + } +} + +template +std::span +Script::skip(std::span tok, std::string_view str) { + if (tok.empty()) + Fatal(ctx) << mf->name << ": expected '" << str << "', but got EOF"; + if (tok[0] != str) + error(tok[0], "expected '" + std::string(str) + "'"); + return tok.subspan(1); +} + +static std::string_view unquote(std::string_view s) { + if (s.size() > 0 && s[0] == '"') { + assert(s[s.size() - 1] == '"'); + return s.substr(1, s.size() - 2); + } + return s; +} + +template +std::span +Script::read_output_format(std::span tok) { + tok = skip(tok, "("); + while (!tok.empty() && tok[0] != ")") + tok = tok.subspan(1); + if (tok.empty()) + Fatal(ctx) << mf->name << ": expected ')', but got EOF"; + return tok.subspan(1); +} + +template +static bool is_in_sysroot(Context &ctx, std::string path) { + std::filesystem::path sysroot = to_abs_path(ctx.arg.sysroot); + std::string rel = to_abs_path(path).lexically_relative(sysroot).string(); + return rel != "." 
&& !rel.starts_with("../"); +} + +template +MappedFile *Script::resolve_path(std::string_view tok, bool check_target) { + std::string str(unquote(tok)); + + auto open = [&](const std::string &path) -> MappedFile * { + MappedFile *mf = open_file(ctx, path); + if (!mf) + return nullptr; + + if (check_target) { + std::string_view target = get_machine_type(ctx, rctx, mf); + if (!target.empty() && target != E::target_name) { + Warn(ctx) << path << ": skipping incompatible file: " << target + << " (e_machine " << (int)E::e_machine << ")"; + return nullptr; + } + } + return mf; + }; + + // GNU ld prepends the sysroot if a pathname starts with '/' and the + // script being processed is in the sysroot. We do the same. + if (str.starts_with('/') && is_in_sysroot(ctx, mf->name)) + return must_open_file(ctx, ctx.arg.sysroot + str); + + if (str.starts_with('=')) { + std::string path; + if (ctx.arg.sysroot.empty()) + path = str.substr(1); + else + path = ctx.arg.sysroot + str.substr(1); + return must_open_file(ctx, path); + } + + if (str.starts_with("-l")) + return find_library(ctx, rctx, str.substr(2)); + + if (!str.starts_with('/')) + if (MappedFile *mf2 = open(path_clean(mf->name + "/../" + str))) + return mf2; + + if (MappedFile *mf = open(str)) + return mf; + + for (std::string_view dir : ctx.arg.library_paths) { + std::string path = std::string(dir) + "/" + str; + if (MappedFile *mf = open(path)) + return mf; + } + + error(tok, "library not found: " + str); +} + +template +std::span +Script::read_group(std::span tok) { + tok = skip(tok, "("); + + while (!tok.empty() && tok[0] != ")") { + if (tok[0] == "AS_NEEDED") { + bool orig = rctx.as_needed; + rctx.as_needed = true; + tok = read_group(tok.subspan(1)); + rctx.as_needed = orig; + continue; + } + + MappedFile *mf = resolve_path(tok[0], true); + read_file(ctx, rctx, mf); + tok = tok.subspan(1); + } + + if (tok.empty()) + Fatal(ctx) << mf->name << ": expected ')', but got EOF"; + return tok.subspan(1); +} + +template +void 
Script::parse_linker_script() { + std::call_once(once, [&] { tokenize(); }); + std::span tok = tokens; + + while (!tok.empty()) { + if (tok[0] == "OUTPUT_FORMAT") { + tok = read_output_format(tok.subspan(1)); + } else if (tok[0] == "INPUT" || tok[0] == "GROUP") { + tok = read_group(tok.subspan(1)); + } else if (tok[0] == "VERSION") { + tok = tok.subspan(1); + tok = skip(tok, "{"); + tok = read_version_script(tok); + tok = skip(tok, "}"); + } else if (tok.size() > 3 && tok[1] == "=" && tok[3] == ";") { + ctx.arg.defsyms.emplace_back(get_symbol(ctx, unquote(tok[0])), + get_symbol(ctx, unquote(tok[2]))); + tok = tok.subspan(4); + } else if (tok[0] == ";") { + tok = tok.subspan(1); + } else { + error(tok[0], "unknown linker script token"); + } + } +} + +template +std::string_view Script::get_script_output_type() { + std::call_once(once, [&] { tokenize(); }); + std::span tok = tokens; + + if (tok.size() >= 3 && tok[0] == "OUTPUT_FORMAT" && tok[1] == "(") { + if (tok[2] == "elf64-x86-64") + return X86_64::target_name; + if (tok[2] == "elf32-i386") + return I386::target_name; + } + + if (tok.size() >= 3 && (tok[0] == "INPUT" || tok[0] == "GROUP") && + tok[1] == "(") + if (MappedFile *mf = resolve_path(tok[2], false)) + return get_machine_type(ctx, rctx, mf); + return ""; +} + +static bool read_label(std::span &tok, std::string label) { + if (tok.size() >= 1 && tok[0] == label + ":") { + tok = tok.subspan(1); + return true; + } + + if (tok.size() >= 2 && tok[0] == label && tok[1] == ":") { + tok = tok.subspan(2); + return true; + } + return false; +} + +template +std::span +Script::read_version_script_commands(std::span tok, + std::string_view ver_str, u16 ver_idx, + bool is_global, bool is_cpp) { + while (!tok.empty() && tok[0] != "}") { + if (read_label(tok, "global")) { + is_global = true; + continue; + } + + if (read_label(tok, "local")) { + is_global = false; + continue; + } + + if (tok[0] == "extern") { + tok = tok.subspan(1); + + if (!tok.empty() && tok[0] == 
"\"C\"") { + tok = tok.subspan(1); + tok = skip(tok, "{"); + tok = read_version_script_commands(tok, ver_str, ver_idx, is_global, false); + } else { + tok = skip(tok, "\"C++\""); + tok = skip(tok, "{"); + tok = read_version_script_commands(tok, ver_str, ver_idx, is_global, true); + } + + tok = skip(tok, "}"); + tok = skip(tok, ";"); + continue; + } + + if (tok[0] == "*") { + ctx.default_version = (is_global ? ver_idx : (u32)VER_NDX_LOCAL); + } else if (is_global) { + ctx.version_patterns.push_back({unquote(tok[0]), mf->name, ver_str, + ver_idx, is_cpp}); + } else { + ctx.version_patterns.push_back({unquote(tok[0]), mf->name, ver_str, + VER_NDX_LOCAL, is_cpp}); + } + + tok = tok.subspan(1); + + if (!tok.empty() && tok[0] == "}") + break; + tok = skip(tok, ";"); + } + return tok; +} + +template +std::span +Script::read_version_script(std::span tok) { + u16 next_ver = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size() + 1; + + while (!tok.empty() && tok[0] != "}") { + std::string_view ver_str; + u16 ver_idx; + + if (tok[0] == "{") { + ver_str = "global"; + ver_idx = VER_NDX_GLOBAL; + } else { + ver_str = tok[0]; + ver_idx = next_ver++; + ctx.arg.version_definitions.push_back(std::string(tok[0])); + tok = tok.subspan(1); + } + + tok = skip(tok, "{"); + tok = read_version_script_commands(tok, ver_str, ver_idx, true, false); + tok = skip(tok, "}"); + if (!tok.empty() && tok[0] != ";") + tok = tok.subspan(1); + tok = skip(tok, ";"); + } + return tok; +} + +template +void Script::parse_version_script() { + std::call_once(once, [&] { tokenize(); }); + std::span tok = tokens; + tok = read_version_script(tok); + if (!tok.empty()) + error(tok[0], "trailing garbage token"); +} + +template +std::span +Script::read_dynamic_list_commands(std::span tok, + std::vector &result, + bool is_cpp) { + while (!tok.empty() && tok[0] != "}") { + if (tok[0] == "extern") { + tok = tok.subspan(1); + + if (!tok.empty() && tok[0] == "\"C\"") { + tok = tok.subspan(1); + tok = skip(tok, 
"{"); + tok = read_dynamic_list_commands(tok, result, false); + } else { + tok = skip(tok, "\"C++\""); + tok = skip(tok, "{"); + tok = read_dynamic_list_commands(tok, result, true); + } + + tok = skip(tok, "}"); + tok = skip(tok, ";"); + continue; + } + + result.push_back({unquote(tok[0]), "", is_cpp}); + tok = skip(tok.subspan(1), ";"); + } + return tok; +} + +template +std::vector Script::parse_dynamic_list() { + std::call_once(once, [&] { tokenize(); }); + std::span tok = tokens; + std::vector result; + + tok = skip(tok, "{"); + tok = read_dynamic_list_commands(tok, result, false); + tok = skip(tok, "}"); + tok = skip(tok, ";"); + + if (!tok.empty()) + error(tok[0], "trailing garbage token"); + + for (DynamicPattern &p : result) + p.source = mf->name; + return result; +} + +template +std::vector +parse_dynamic_list(Context &ctx, std::string_view path) { + ReaderContext rctx; + MappedFile *mf = must_open_file(ctx, std::string(path)); + return Script(ctx, rctx, mf).parse_dynamic_list(); +} + +using E = MOLD_TARGET; + +template class Script; + +template +std::vector parse_dynamic_list(Context &, std::string_view); + +} // namespace mold diff --git a/elf/lto-unix.cc b/src/lto-unix.cc similarity index 78% rename from elf/lto-unix.cc rename to src/lto-unix.cc index b2114a15..740842f8 100644 --- a/elf/lto-unix.cc +++ b/src/lto-unix.cc @@ -95,7 +95,7 @@ # define LOG std::ostringstream() #endif -namespace mold::elf { +namespace mold { // Global variables // We store LTO-related information to global variables, @@ -126,7 +126,7 @@ static PluginStatus message(PluginLevel level, const char *fmt, ...) 
{ switch (level) { case LDPL_INFO: - SyncOut(ctx) << buf; + Out(ctx) << buf; break; case LDPL_WARNING: Warn(ctx) << buf; @@ -176,7 +176,7 @@ static PluginStatus add_input_file(const char *path) { Context &ctx = *gctx; static i64 file_priority = 100; - MappedFile> *mf = MappedFile>::must_open(ctx, path); + MappedFile *mf = must_open_file(ctx, path); ObjectFile *file = ObjectFile::create(ctx, mf, "", false); ctx.obj_pool.emplace_back(file); @@ -200,10 +200,7 @@ static PluginStatus release_input_file(const void *handle) { LOG << "release_input_file\n"; ObjectFile &file = *(ObjectFile *)handle; - if (file.mf->fd != -1) { - close(file.mf->fd); - file.mf->fd = -1; - } + file.mf->close_fd(); return LDPS_OK; } @@ -427,7 +424,7 @@ get_api_version(const char *plugin_identifier, if (LAPI_V1 < minimal_api_supported) Fatal(*gctx) << "LTO plugin does not support V0 or V1 API"; - std::string version = mold_version + "\0"s; + std::string version = get_mold_version() + "\0"s; *linker_identifier = "mold"; *linker_version = version.data(); @@ -439,72 +436,77 @@ get_api_version(const char *plugin_identifier, return LAPI_V0; } +// dlopen the linker plugin file template -static void load_plugin(Context &ctx) { - assert(phase == 0); - phase = 1; - gctx = &ctx; - - void *handle = dlopen(ctx.arg.plugin.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (!handle) - Fatal(ctx) << "could not open plugin file: " << dlerror(); - - OnloadFn *onload = (OnloadFn *)dlsym(handle, "onload"); - if (!onload) - Fatal(ctx) << "failed to load plugin " << ctx.arg.plugin << ": " - << dlerror(); - - auto save = [&](std::string_view str) { - return save_string(ctx, std::string(str).c_str()).data(); - }; +static void load_lto_plugin(Context &ctx) { + static std::once_flag flag; - std::vector tv; - tv.emplace_back(LDPT_MESSAGE, message); - - if (ctx.arg.shared) - tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_DYN); - else if (ctx.arg.pie) - tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_PIE); - else - 
tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_EXEC); - - for (std::string_view opt : ctx.arg.plugin_opt) - tv.emplace_back(LDPT_OPTION, save(opt)); - - tv.emplace_back(LDPT_REGISTER_CLAIM_FILE_HOOK, register_claim_file_hook); - tv.emplace_back(LDPT_REGISTER_ALL_SYMBOLS_READ_HOOK, - register_all_symbols_read_hook); - tv.emplace_back(LDPT_REGISTER_CLEANUP_HOOK, register_cleanup_hook); - tv.emplace_back(LDPT_ADD_SYMBOLS, add_symbols); - tv.emplace_back(LDPT_GET_SYMBOLS, get_symbols_v1); - tv.emplace_back(LDPT_ADD_INPUT_FILE, add_input_file); - tv.emplace_back(LDPT_GET_INPUT_FILE, get_input_file); - tv.emplace_back(LDPT_RELEASE_INPUT_FILE, release_input_file); - tv.emplace_back(LDPT_ADD_INPUT_LIBRARY, add_input_library); - tv.emplace_back(LDPT_OUTPUT_NAME, save(ctx.arg.output)); - tv.emplace_back(LDPT_SET_EXTRA_LIBRARY_PATH, set_extra_library_path); - tv.emplace_back(LDPT_GET_VIEW, get_view); - tv.emplace_back(LDPT_GET_INPUT_SECTION_COUNT, get_input_section_count); - tv.emplace_back(LDPT_GET_INPUT_SECTION_TYPE, get_input_section_type); - tv.emplace_back(LDPT_GET_INPUT_SECTION_NAME, get_input_section_name); - tv.emplace_back(LDPT_GET_INPUT_SECTION_CONTENTS, get_input_section_contents); - tv.emplace_back(LDPT_UPDATE_SECTION_ORDER, update_section_order); - tv.emplace_back(LDPT_ALLOW_SECTION_ORDERING, allow_section_ordering); - tv.emplace_back(LDPT_ADD_SYMBOLS_V2, add_symbols); - tv.emplace_back(LDPT_GET_SYMBOLS_V2, get_symbols_v2); - tv.emplace_back(LDPT_ALLOW_UNIQUE_SEGMENT_FOR_SECTIONS, - allow_unique_segment_for_sections); - tv.emplace_back(LDPT_UNIQUE_SEGMENT_FOR_SECTIONS, unique_segment_for_sections); - tv.emplace_back(LDPT_GET_SYMBOLS_V3, get_symbols_v3); - tv.emplace_back(LDPT_GET_INPUT_SECTION_ALIGNMENT, get_input_section_alignment); - tv.emplace_back(LDPT_GET_INPUT_SECTION_SIZE, get_input_section_size); - tv.emplace_back(LDPT_REGISTER_NEW_INPUT_HOOK, register_new_input_hook); - tv.emplace_back(LDPT_GET_WRAP_SYMBOLS, get_wrap_symbols); - 
tv.emplace_back(LDPT_GET_API_VERSION, get_api_version); - tv.emplace_back(LDPT_NULL, 0); - - [[maybe_unused]] PluginStatus status = onload(tv.data()); - assert(status == LDPS_OK); + std::call_once(flag, [&] { + assert(phase == 0); + phase = 1; + gctx = &ctx; + + void *handle = dlopen(ctx.arg.plugin.c_str(), RTLD_NOW | RTLD_LOCAL); + if (!handle) + Fatal(ctx) << "could not open plugin file: " << dlerror(); + + OnloadFn *onload = (OnloadFn *)dlsym(handle, "onload"); + if (!onload) + Fatal(ctx) << "failed to load plugin " << ctx.arg.plugin << ": " + << dlerror(); + + auto save = [&](std::string_view str) { + return save_string(ctx, std::string(str).c_str()).data(); + }; + + std::vector tv; + tv.emplace_back(LDPT_MESSAGE, message); + + if (ctx.arg.shared) + tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_DYN); + else if (ctx.arg.pie) + tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_PIE); + else + tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_EXEC); + + for (std::string_view opt : ctx.arg.plugin_opt) + tv.emplace_back(LDPT_OPTION, save(opt)); + + tv.emplace_back(LDPT_REGISTER_CLAIM_FILE_HOOK, register_claim_file_hook); + tv.emplace_back(LDPT_REGISTER_ALL_SYMBOLS_READ_HOOK, + register_all_symbols_read_hook); + tv.emplace_back(LDPT_REGISTER_CLEANUP_HOOK, register_cleanup_hook); + tv.emplace_back(LDPT_ADD_SYMBOLS, add_symbols); + tv.emplace_back(LDPT_GET_SYMBOLS, get_symbols_v1); + tv.emplace_back(LDPT_ADD_INPUT_FILE, add_input_file); + tv.emplace_back(LDPT_GET_INPUT_FILE, get_input_file); + tv.emplace_back(LDPT_RELEASE_INPUT_FILE, release_input_file); + tv.emplace_back(LDPT_ADD_INPUT_LIBRARY, add_input_library); + tv.emplace_back(LDPT_OUTPUT_NAME, save(ctx.arg.output)); + tv.emplace_back(LDPT_SET_EXTRA_LIBRARY_PATH, set_extra_library_path); + tv.emplace_back(LDPT_GET_VIEW, get_view); + tv.emplace_back(LDPT_GET_INPUT_SECTION_COUNT, get_input_section_count); + tv.emplace_back(LDPT_GET_INPUT_SECTION_TYPE, get_input_section_type); + tv.emplace_back(LDPT_GET_INPUT_SECTION_NAME, 
get_input_section_name); + tv.emplace_back(LDPT_GET_INPUT_SECTION_CONTENTS, get_input_section_contents); + tv.emplace_back(LDPT_UPDATE_SECTION_ORDER, update_section_order); + tv.emplace_back(LDPT_ALLOW_SECTION_ORDERING, allow_section_ordering); + tv.emplace_back(LDPT_ADD_SYMBOLS_V2, add_symbols); + tv.emplace_back(LDPT_GET_SYMBOLS_V2, get_symbols_v2); + tv.emplace_back(LDPT_ALLOW_UNIQUE_SEGMENT_FOR_SECTIONS, + allow_unique_segment_for_sections); + tv.emplace_back(LDPT_UNIQUE_SEGMENT_FOR_SECTIONS, unique_segment_for_sections); + tv.emplace_back(LDPT_GET_SYMBOLS_V3, get_symbols_v3); + tv.emplace_back(LDPT_GET_INPUT_SECTION_ALIGNMENT, get_input_section_alignment); + tv.emplace_back(LDPT_GET_INPUT_SECTION_SIZE, get_input_section_size); + tv.emplace_back(LDPT_REGISTER_NEW_INPUT_HOOK, register_new_input_hook); + tv.emplace_back(LDPT_GET_WRAP_SYMBOLS, get_wrap_symbols); + tv.emplace_back(LDPT_GET_API_VERSION, get_api_version); + tv.emplace_back(LDPT_NULL, 0); + + [[maybe_unused]] PluginStatus status = onload(tv.data()); + assert(status == LDPS_OK); + }); } template @@ -565,7 +567,7 @@ static ElfSym to_elf_sym(PluginSymbol &psym) { // Returns false if it's GCC. template static bool is_llvm(Context &ctx) { - return ctx.arg.plugin.ends_with("LLVMgold.so"); + return ctx.arg.plugin.find("LLVMgold.") != ctx.arg.plugin.npos; } // Returns true if a given linker plugin supports the get_symbols_v3 API. @@ -576,7 +578,28 @@ static bool supports_v3_api(Context &ctx) { } template -ObjectFile *read_lto_object(Context &ctx, MappedFile> *mf) { +static PluginInputFile +create_plugin_input_file(Context &ctx, MappedFile *mf) { + PluginInputFile file; + MappedFile *mf2 = mf->parent ? 
mf->parent : mf; + + file.name = save_string(ctx, mf2->name).data(); + file.offset = mf->get_offset(); + file.filesize = mf->size; + + mf2->reopen_fd(file.name); + + file.fd = mf2->fd; + + if (!file.fd) + Fatal(ctx) << "cannot open " << file.name << ": " << errno_string(); + return file; +} + +template +ObjectFile *read_lto_object(Context &ctx, MappedFile *mf) { + load_lto_plugin(ctx); + // V0 API's claim_file is not thread-safe. static std::mutex mu; std::unique_lock lock(mu, std::defer_lock); @@ -589,10 +612,6 @@ ObjectFile *read_lto_object(Context &ctx, MappedFile> *mf) { << "added -flto not only for creating object files but also for " << "creating the final executable."; - // dlopen the linker plugin file - static std::once_flag flag; - std::call_once(flag, [&] { load_plugin(ctx); }); - // Create mold's object instance ObjectFile *obj = new ObjectFile; ctx.obj_pool.emplace_back(obj); @@ -602,23 +621,10 @@ ObjectFile *read_lto_object(Context &ctx, MappedFile> *mf) { obj->first_global = 1; obj->is_lto_obj = true; obj->mf = mf; + obj->archive_name = mf->parent ? mf->parent->name : ""; // Create plugin's object instance - PluginInputFile file = {}; - - MappedFile> *mf2 = mf->parent ? mf->parent : mf; - file.name = save_string(ctx, mf2->name).data(); - if (mf2->fd == -1) - mf2->fd = open(file.name, O_RDONLY); - file.fd = mf2->fd; - if (file.fd == -1) - Fatal(ctx) << "cannot open " << file.name << ": " << errno_string(); - - if (mf->parent) - obj->archive_name = mf->parent->name; - - file.offset = mf->get_offset(); - file.filesize = mf->size; + PluginInputFile file = create_plugin_input_file(ctx, mf); file.handle = (void *)obj; LOG << "read_lto_symbols: "<< mf->name << "\n"; @@ -635,48 +641,42 @@ ObjectFile *read_lto_object(Context &ctx, MappedFile> *mf) { // LLVM needs it and takes the ownership of fd. To prevent "too many // open files" issue, we close fd only for GCC. This is ugly, though. 
if (!is_llvm(ctx)) { - close(mf2->fd); - mf2->fd = -1; + MappedFile *mf2 = mf->parent ? mf->parent : mf; + mf2->close_fd(); } - // Initialize object symbols - std::vector> *esyms = new std::vector>(1); - obj->has_symver.resize(plugin_symbols.size()); - obj->lto_symbol_versions.resize(plugin_symbols.size()); + // Create a symbol strtab + i64 strtab_size = 1; + for (PluginSymbol &psym : plugin_symbols) + strtab_size += strlen(psym.name) + 1; + std::string strtab(strtab_size, '\0'); + + // Initialize esyms + obj->lto_elf_syms.resize(plugin_symbols.size() + 1); + i64 strtab_offset = 1; for (i64 i = 0; i < plugin_symbols.size(); i++) { PluginSymbol &psym = plugin_symbols[i]; - esyms->push_back(to_elf_sym(psym)); + obj->lto_elf_syms[i + 1] = to_elf_sym(psym); + obj->lto_elf_syms[i + 1].st_name = strtab_offset; - std::string_view key = save_string(ctx, psym.name); - std::string_view name = key; - - // Parse symbol version after atsign - if (i64 pos = name.find('@'); pos != name.npos) { - std::string_view ver = name.substr(pos); - name = name.substr(0, pos); - - if (ver != "@" && ver != "@@") { - if (ver.starts_with("@@")) - key = name; - obj->has_symver.set(i); - obj->lto_symbol_versions[i] = ver.substr(1); - } - } - - obj->symbols.push_back(get_symbol(ctx, key, name)); + i64 len = strlen(psym.name); + memcpy(strtab.data() + strtab_offset, psym.name, len); + strtab_offset += len + 1; } - obj->elf_syms = *esyms; - obj->has_symver.resize(esyms->size()); + obj->symbol_strtab = save_string(ctx, strtab); + obj->elf_syms = obj->lto_elf_syms; + obj->initialize_symbols(ctx); plugin_symbols.clear(); return obj; } // Entry point template -std::vector *> do_lto(Context &ctx) { - Timer t(ctx, "do_lto"); +std::vector *> run_lto_plugin(Context &ctx) { + Timer t(ctx, "run_lto_plugin"); + load_lto_plugin(ctx); if (!ctx.arg.lto_pass2 && !supports_v3_api(ctx)) restart_process(ctx); @@ -713,6 +713,18 @@ std::vector *> do_lto(Context &ctx) { for (Symbol *sym : ctx.arg.undefined) 
sym->referenced_by_regular_obj = true; + // Object files containing .gnu.offload_lto_.* sections need to be + // given to the LTO backend. Such sections contains code and data for + // peripherails (typically GPUs). + for (ObjectFile *file : ctx.objs) { + if (file->is_alive && !file->is_lto_obj && file->is_gcc_offload_obj) { + PluginInputFile pfile = create_plugin_input_file(ctx, file->mf); + int claimed = false; + claim_file_hook(&pfile, &claimed); + assert(!claimed); + } + } + // all_symbols_read_hook() calls add_input_file() and add_input_library() LOG << "all symbols read\n"; if (PluginStatus st = all_symbols_read_hook(); st != LDPS_OK) @@ -724,15 +736,14 @@ std::vector *> do_lto(Context &ctx) { template void lto_cleanup(Context &ctx) { Timer t(ctx, "lto_cleanup"); - if (cleanup_hook) cleanup_hook(); } using E = MOLD_TARGET; -template ObjectFile *read_lto_object(Context &, MappedFile> *); -template std::vector *> do_lto(Context &); +template ObjectFile *read_lto_object(Context &, MappedFile *); +template std::vector *> run_lto_plugin(Context &); template void lto_cleanup(Context &); -} // namespace mold::elf +} // namespace mold diff --git a/elf/lto-win32.cc b/src/lto-win32.cc similarity index 57% rename from elf/lto-win32.cc rename to src/lto-win32.cc index d55ba3a1..f5d17eec 100644 --- a/elf/lto-win32.cc +++ b/src/lto-win32.cc @@ -1,15 +1,15 @@ #include "mold.h" #include "lto.h" -namespace mold::elf { +namespace mold { template -ObjectFile *read_lto_object(Context &ctx, MappedFile> *mf) { +ObjectFile *read_lto_object(Context &ctx, MappedFile *mf) { Fatal(ctx) << "LTO is not supported on Windows"; } template -std::vector *> do_lto(Context &ctx) { +std::vector *> run_lto_plugin(Context &ctx) { return {}; } @@ -18,8 +18,8 @@ void lto_cleanup(Context &ctx) {} using E = MOLD_TARGET; -template ObjectFile *read_lto_object(Context &, MappedFile> *); -template std::vector *> do_lto(Context &); +template ObjectFile *read_lto_object(Context &, MappedFile *); +template 
std::vector *> run_lto_plugin(Context &); template void lto_cleanup(Context &); -} // namespace mold::elf +} // namespace mold diff --git a/elf/lto.h b/src/lto.h similarity index 97% rename from elf/lto.h rename to src/lto.h index 489a06fa..f1795534 100644 --- a/elf/lto.h +++ b/src/lto.h @@ -1,6 +1,6 @@ #pragma once -#include "../common/integers.h" +#include "../lib/integers.h" namespace mold { @@ -73,7 +73,11 @@ enum PluginOutputFileType { struct PluginInputFile { const char *name; - i32 fd; +#if __MINGW32__ + HANDLE fd; +#else + int fd; +#endif u64 offset; u64 filesize; void *handle; diff --git a/elf/main.cc b/src/main.cc similarity index 66% rename from elf/main.cc rename to src/main.cc index 6eacf407..ce94043c 100644 --- a/elf/main.cc +++ b/src/main.cc @@ -1,7 +1,6 @@ #include "mold.h" -#include "../common/archive-file.h" -#include "../common/cmdline.h" -#include "../common/output-file.h" +#include "filetype.h" +#include "../lib/archive-file.h" #include #include @@ -17,17 +16,24 @@ #ifdef _WIN32 # include -# define _chdir chdir +# define chdir _chdir #else # include #endif -namespace mold::elf { +#ifdef MOLD_X86_64 +int main(int argc, char **argv) { + return mold::mold_main(argc, argv); +} +#endif + +namespace mold { // Read the beginning of a given file and returns its machine type // (e.g. EM_X86_64 or EM_386). template -std::string_view get_machine_type(Context &ctx, MappedFile> *mf) { +std::string_view +get_machine_type(Context &ctx, ReaderContext &rctx, MappedFile *mf) { auto get_elf_type = [&](u8 *buf) -> std::string_view { bool is_le = (((ElfEhdr *)buf)->e_ident[EI_DATA] == ELFDATA2LSB); bool is_64; @@ -68,8 +74,6 @@ std::string_view get_machine_type(Context &ctx, MappedFile> *mf) { return M68K::target_name; case EM_SH: return SH4::target_name; - case EM_ALPHA: - return ALPHA::target_name; case EM_LOONGARCH: return is_64 ? 
LOONGARCH64::target_name : LOONGARCH32::target_name; default: @@ -83,17 +87,19 @@ std::string_view get_machine_type(Context &ctx, MappedFile> *mf) { case FileType::GCC_LTO_OBJ: return get_elf_type(mf->data); case FileType::AR: - for (MappedFile> *child : read_fat_archive_members(ctx, mf)) - if (get_file_type(ctx, child) == FileType::ELF_OBJ) + for (MappedFile *child : read_fat_archive_members(ctx, mf)) + if (FileType ty = get_file_type(ctx, child); + ty == FileType::ELF_OBJ || ty == FileType::GCC_LTO_OBJ) return get_elf_type(child->data); return ""; case FileType::THIN_AR: - for (MappedFile> *child : read_thin_archive_members(ctx, mf)) - if (get_file_type(ctx, child) == FileType::ELF_OBJ) + for (MappedFile *child : read_thin_archive_members(ctx, mf)) + if (FileType ty = get_file_type(ctx, child); + ty == FileType::ELF_OBJ || ty == FileType::GCC_LTO_OBJ) return get_elf_type(child->data); return ""; case FileType::TEXT: - return get_script_output_type(ctx, mf); + return Script(ctx, rctx, mf).get_script_output_type(); default: return ""; } @@ -101,33 +107,33 @@ std::string_view get_machine_type(Context &ctx, MappedFile> *mf) { template static void -check_file_compatibility(Context &ctx, MappedFile> *mf) { - std::string_view target = get_machine_type(ctx, mf); +check_file_compatibility(Context &ctx, ReaderContext &rctx, MappedFile *mf) { + std::string_view target = get_machine_type(ctx, rctx, mf); if (target != ctx.arg.emulation) Fatal(ctx) << mf->name << ": incompatible file type: " << ctx.arg.emulation << " is expected but got " << target; } template -static ObjectFile *new_object_file(Context &ctx, MappedFile> *mf, - std::string archive_name) { +static ObjectFile *new_object_file(Context &ctx, ReaderContext &rctx, + MappedFile *mf, std::string archive_name) { static Counter count("parsed_objs"); count++; - check_file_compatibility(ctx, mf); + check_file_compatibility(ctx, rctx, mf); - bool in_lib = ctx.in_lib || (!archive_name.empty() && !ctx.whole_archive); + bool 
in_lib = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive); ObjectFile *file = ObjectFile::create(ctx, mf, archive_name, in_lib); file->priority = ctx.file_priority++; - ctx.tg.run([file, &ctx] { file->parse(ctx); }); + rctx.tg->run([file, &ctx] { file->parse(ctx); }); if (ctx.arg.trace) - SyncOut(ctx) << "trace: " << *file; + Out(ctx) << "trace: " << *file; return file; } template -static ObjectFile *new_lto_obj(Context &ctx, MappedFile> *mf, - std::string archive_name) { +static ObjectFile *new_lto_obj(Context &ctx, ReaderContext &rctx, + MappedFile *mf, std::string archive_name) { static Counter count("parsed_lto_objs"); count++; @@ -137,50 +143,46 @@ static ObjectFile *new_lto_obj(Context &ctx, MappedFile> *mf, ObjectFile *file = read_lto_object(ctx, mf); file->priority = ctx.file_priority++; file->archive_name = archive_name; - file->is_in_lib = ctx.in_lib || (!archive_name.empty() && !ctx.whole_archive); + file->is_in_lib = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive); file->is_alive = !file->is_in_lib; - ctx.has_lto_object = true; if (ctx.arg.trace) - SyncOut(ctx) << "trace: " << *file; + Out(ctx) << "trace: " << *file; return file; } template static SharedFile * -new_shared_file(Context &ctx, MappedFile> *mf) { - check_file_compatibility(ctx, mf); +new_shared_file(Context &ctx, ReaderContext &rctx, MappedFile *mf) { + check_file_compatibility(ctx, rctx, mf); SharedFile *file = SharedFile::create(ctx, mf); file->priority = ctx.file_priority++; - ctx.tg.run([file, &ctx] { file->parse(ctx); }); + file->is_alive = !rctx.as_needed; + rctx.tg->run([file, &ctx] { file->parse(ctx); }); if (ctx.arg.trace) - SyncOut(ctx) << "trace: " << *file; + Out(ctx) << "trace: " << *file; return file; } template -void read_file(Context &ctx, MappedFile> *mf) { - if (ctx.visited.contains(mf->name)) - return; - +void read_file(Context &ctx, ReaderContext &rctx, MappedFile *mf) { switch (get_file_type(ctx, mf)) { case FileType::ELF_OBJ: - 
ctx.objs.push_back(new_object_file(ctx, mf, "")); + ctx.objs.push_back(new_object_file(ctx, rctx, mf, "")); return; case FileType::ELF_DSO: - ctx.dsos.push_back(new_shared_file(ctx, mf)); - ctx.visited.insert(mf->name); + ctx.dsos.push_back(new_shared_file(ctx, rctx, mf)); return; case FileType::AR: case FileType::THIN_AR: - for (MappedFile> *child : read_archive_members(ctx, mf)) { + for (MappedFile *child : read_archive_members(ctx, mf)) { switch (get_file_type(ctx, child)) { case FileType::ELF_OBJ: - ctx.objs.push_back(new_object_file(ctx, child, mf->name)); + ctx.objs.push_back(new_object_file(ctx, rctx, child, mf->name)); break; case FileType::GCC_LTO_OBJ: case FileType::LLVM_BITCODE: - if (ObjectFile *file = new_lto_obj(ctx, child, mf->name)) + if (ObjectFile *file = new_lto_obj(ctx, rctx, child, mf->name)) ctx.objs.push_back(file); break; case FileType::ELF_DSO: @@ -191,15 +193,13 @@ void read_file(Context &ctx, MappedFile> *mf) { break; } } - if (!ctx.whole_archive) - ctx.visited.insert(mf->name); return; case FileType::TEXT: - parse_linker_script(ctx, mf); + Script(ctx, rctx, mf).parse_linker_script(); return; case FileType::GCC_LTO_OBJ: case FileType::LLVM_BITCODE: - if (ObjectFile *file = new_lto_obj(ctx, mf, "")) + if (ObjectFile *file = new_lto_obj(ctx, rctx, mf, "")) ctx.objs.push_back(file); return; default: @@ -209,36 +209,60 @@ void read_file(Context &ctx, MappedFile> *mf) { template static std::string_view -deduce_machine_type(Context &ctx, std::span args) { - for (std::string_view arg : args) - if (!arg.starts_with('-')) - if (auto *mf = MappedFile>::open(ctx, std::string(arg))) - if (std::string_view target = get_machine_type(ctx, mf); - !target.empty()) - return target; +detect_machine_type(Context &ctx, std::vector args) { + for (ReaderContext rctx; const std::string &arg : args) { + if (arg == "--Bstatic") { + rctx.static_ = true; + } else if (arg == "--Bdynamic") { + rctx.static_ = false; + } else if (!arg.starts_with('-')) { + if 
(MappedFile *mf = open_file(ctx, arg)) + if (get_file_type(ctx, mf) != FileType::TEXT) + if (std::string_view target = get_machine_type(ctx, rctx, mf); + !target.empty()) + return target; + } + } + + for (ReaderContext rctx; const std::string &arg : args) { + if (arg == "--Bstatic") { + rctx.static_ = true; + } else if (arg == "--Bdynamic") { + rctx.static_ = false; + } else if (!arg.starts_with('-')) { + if (MappedFile *mf = open_file(ctx, arg)) + if (get_file_type(ctx, mf) == FileType::TEXT) + if (std::string_view target = + Script(ctx, rctx, mf).get_script_output_type(); + !target.empty()) + return target; + } + } + Fatal(ctx) << "-m option is missing"; } template -MappedFile> *open_library(Context &ctx, std::string path) { - MappedFile> *mf = MappedFile>::open(ctx, path); +MappedFile *open_library(Context &ctx, ReaderContext &rctx, std::string path) { + MappedFile *mf = open_file(ctx, path); if (!mf) return nullptr; - std::string_view target = get_machine_type(ctx, mf); - if (target.empty() || target == E::target_name) - return mf; - Warn(ctx) << path << ": skipping incompatible file " << target - << " " << (int)E::e_machine; - return nullptr; + std::string_view target = get_machine_type(ctx, rctx, mf); + if (!target.empty() && target != E::target_name) { + Warn(ctx) << path << ": skipping incompatible file: " << target + << " (e_machine " << (int)E::e_machine << ")"; + return nullptr; + } + return mf; } template -MappedFile> *find_library(Context &ctx, std::string name) { +MappedFile *find_library(Context &ctx, ReaderContext &rctx, std::string name) { if (name.starts_with(':')) { for (std::string_view dir : ctx.arg.library_paths) { std::string path = std::string(dir) + "/" + name.substr(1); - if (MappedFile> *mf = open_library(ctx, path)) + if (MappedFile *mf = open_library(ctx, rctx, path)) return mf; } Fatal(ctx) << "library not found: " << name; @@ -246,94 +270,88 @@ MappedFile> *find_library(Context &ctx, std::string name) { for (std::string_view dir : 
ctx.arg.library_paths) { std::string stem = std::string(dir) + "/lib" + name; - if (!ctx.is_static) - if (MappedFile> *mf = open_library(ctx, stem + ".so")) + if (!rctx.static_) + if (MappedFile *mf = open_library(ctx, rctx, stem + ".so")) return mf; - if (MappedFile> *mf = open_library(ctx, stem + ".a")) + if (MappedFile *mf = open_library(ctx, rctx, stem + ".a")) return mf; } Fatal(ctx) << "library not found: " << name; } -template -MappedFile> *find_from_search_paths(Context &ctx, std::string name) { - if (MappedFile> *mf = MappedFile>::open(ctx, name)) - return mf; - - for (std::string_view dir : ctx.arg.library_paths) - if (MappedFile> *mf = - MappedFile>::open(ctx, std::string(dir) + "/" + name)) - return mf; - return nullptr; -} - template static void read_input_files(Context &ctx, std::span args) { Timer t(ctx, "read_input_files"); - std::vector> state; - ctx.is_static = ctx.arg.is_static; + ReaderContext rctx; + std::vector stack; + std::unordered_set visited; + + tbb::task_group tg; + rctx.tg = &tg; while (!args.empty()) { std::string_view arg = args[0]; args = args.subspan(1); if (arg == "--as-needed") { - ctx.as_needed = true; + rctx.as_needed = true; } else if (arg == "--no-as-needed") { - ctx.as_needed = false; + rctx.as_needed = false; } else if (arg == "--whole-archive") { - ctx.whole_archive = true; + rctx.whole_archive = true; } else if (arg == "--no-whole-archive") { - ctx.whole_archive = false; + rctx.whole_archive = false; } else if (arg == "--Bstatic") { - ctx.is_static = true; + rctx.static_ = true; } else if (arg == "--Bdynamic") { - ctx.is_static = false; + rctx.static_ = false; } else if (arg == "--start-lib") { - ctx.in_lib = true; + rctx.in_lib = true; } else if (arg == "--end-lib") { - ctx.in_lib = false; - } else if (remove_prefix(arg, "--version-script=")) { - MappedFile> *mf = find_from_search_paths(ctx, std::string(arg)); - if (!mf) - Fatal(ctx) << "--version-script: file not found: " << arg; - parse_version_script(ctx, mf); + 
rctx.in_lib = false; } else if (arg == "--push-state") { - state.push_back({ctx.as_needed, ctx.whole_archive, ctx.is_static, - ctx.in_lib}); + stack.push_back(rctx); } else if (arg == "--pop-state") { - if (state.empty()) + if (stack.empty()) Fatal(ctx) << "no state pushed before popping"; - std::tie(ctx.as_needed, ctx.whole_archive, ctx.is_static, ctx.in_lib) = - state.back(); - state.pop_back(); - } else if (remove_prefix(arg, "-l")) { - MappedFile> *mf = find_library(ctx, std::string(arg)); + rctx = stack.back(); + stack.pop_back(); + } else if (arg.starts_with("-l")) { + arg = arg.substr(2); + if (visited.contains(arg)) + continue; + visited.insert(arg); + + MappedFile *mf = find_library(ctx, rctx, std::string(arg)); mf->given_fullpath = false; - read_file(ctx, mf); + read_file(ctx, rctx, mf); } else { - read_file(ctx, MappedFile>::must_open(ctx, std::string(arg))); + read_file(ctx, rctx, must_open_file(ctx, std::string(arg))); } } if (ctx.objs.empty()) Fatal(ctx) << "no input files"; - ctx.tg.wait(); + tg.wait(); +} + +template +static bool has_lto_obj(Context &ctx) { + for (ObjectFile *file : ctx.objs) + if (file->is_alive && (file->is_lto_obj || file->is_gcc_offload_obj)) + return true; + return false; } template -int elf_main(int argc, char **argv) { +int mold_main(int argc, char **argv) { Context ctx; // Process -run option first. process_run_subcommand() does not return. - if (argc >= 2 && (argv[1] == "-run"sv || argv[1] == "--run"sv)) { -#if defined(_WIN32) || defined(__APPLE__) - Fatal(ctx) << "-run is supported only on Unix"; -#endif + if (argc >= 2 && (argv[1] == "-run"sv || argv[1] == "--run"sv)) process_run_subcommand(ctx, argc, argv); - } // Parse non-positional command line options ctx.cmdline_args = expand_response_files(ctx, argv); @@ -341,7 +359,7 @@ int elf_main(int argc, char **argv) { // If no -m option is given, deduce it from input files. 
if (ctx.arg.emulation.empty()) - ctx.arg.emulation = deduce_machine_type(ctx, file_args); + ctx.arg.emulation = detect_machine_type(ctx, file_args); // Redo if -m is not x86-64. if constexpr (is_x86_64) @@ -358,14 +376,10 @@ int elf_main(int argc, char **argv) { << ": " << errno_string(); // Fork a subprocess unless --no-fork is given. - std::function on_complete; - -#if !defined(_WIN32) && !defined(__APPLE__) if (ctx.arg.fork) - on_complete = fork_child(); -#endif + fork_child(); - acquire_global_lock(ctx); + acquire_global_lock(); tbb::global_control tbb_cont(tbb::global_control::max_allowed_parallelism, ctx.arg.thread_count); @@ -376,8 +390,8 @@ int elf_main(int argc, char **argv) { // Handle --retain-symbols-file options if any. if (ctx.arg.retain_symbols_file) - for (std::string_view name : *ctx.arg.retain_symbols_file) - get_symbol(ctx, name)->write_to_symtab = true; + for (Symbol *sym : *ctx.arg.retain_symbols_file) + sym->write_to_symtab = true; for (std::string_view arg : ctx.arg.trace_symbol) get_symbol(ctx, arg)->is_traced = true; @@ -402,23 +416,26 @@ int elf_main(int argc, char **argv) { if (!ctx.arg.relocatable) create_internal_file(ctx); - // resolve_symbols is 4 things in 1 phase: - // - // - Determine the set of object files to extract from archives. - // - Remove redundant COMDAT sections (e.g. duplicate inline functions). - // - Finally, the actual symbol resolution. - // - LTO, which requires preliminary symbol resolution before running - // and a follow-up re-resolution after the LTO objects are emitted. - // - // These passes have complex interactions, and unfortunately has to be - // put together in a single phase. + // Resolve symbols by choosing the most appropriate file for each + // symbol. This pass also removes redundant comdat sections (e.g. + // duplicate inline functions). resolve_symbols(ctx); - // "Kill" .eh_frame input sections after symbol resolution. 
- kill_eh_frame_sections(ctx); + // If there's an object file compiled with -flto, do link-time + // optimization. + if (has_lto_obj(ctx)) + do_lto(ctx); + + // Now that we know which object files are to be included to the + // final output, we can remove unnecessary files. + std::erase_if(ctx.objs, [](InputFile *file) { return !file->is_alive; }); + std::erase_if(ctx.dsos, [](InputFile *file) { return !file->is_alive; }); + + // Parse .eh_frame section contents. + parse_eh_frame_sections(ctx); - // Resolve mergeable section pieces to merge them. - resolve_section_pieces(ctx); + // Split mergeable section contents into section pieces. + create_merged_sections(ctx); // Handle --relocatable. Since the linker's behavior is quite different // from the normal one when the option is given, the logic is implemented @@ -441,7 +458,7 @@ int elf_main(int argc, char **argv) { compute_import_export(ctx); // Set "address-taken" bits for input sections. - if (ctx.arg.icf || ctx.arg.z_rewrite_endbr) + if (ctx.arg.icf) compute_address_significance(ctx); // Garbage-collect unreachable sections. @@ -452,9 +469,6 @@ int elf_main(int argc, char **argv) { if (ctx.arg.icf) icf_sections(ctx); - // Compute sizes of sections containing mergeable strings. - compute_merged_section_sizes(ctx); - // Create linker-synthesized sections such as .got or .plt. create_synthetic_sections(ctx); @@ -462,6 +476,9 @@ int elf_main(int argc, char **argv) { if (!ctx.arg.allow_multiple_definition) check_duplicate_symbols(ctx); + if (!ctx.arg.allow_shlib_undefined) + check_shlib_undefined(ctx); + // Warn if symbols with different types are defined under the same name. check_symbol_types(ctx); @@ -471,6 +488,14 @@ int elf_main(int argc, char **argv) { // Bin input sections into output sections. create_output_sections(ctx); + // Convert an .ARM.exidx to a synthetic section. + if constexpr (is_arm32) + create_arm_exidx_section(ctx); + + // Handle --section-align options. 
+ if (!ctx.arg.section_align.empty()) + apply_section_align(ctx); + // Add synthetic symbols such as __ehdr_start or __end. add_synthetic_symbols(ctx); @@ -521,8 +546,7 @@ int elf_main(int argc, char **argv) { // If .ctors/.dtors are to be placed to .init_array/.fini_array, // we need to reverse their contents. - if (ctx.has_init_array && ctx.has_ctors) - fixup_ctors_in_init_array(ctx); + fixup_ctors_in_init_array(ctx); // Handle --shuffle-sections if (ctx.arg.shuffle_sections != SHUFFLE_SECTIONS_NONE) @@ -547,14 +571,20 @@ int elf_main(int argc, char **argv) { // .got.plt, .dynsym, .dynstr, etc. scan_relocations(ctx); - // Compute sizes of output sections while assigning offsets - // within an output section to input sections. - compute_section_sizes(ctx); + // Compute the is_weak bit for each imported symbol. + compute_imported_symbol_weakness(ctx); // Sort sections by section attributes so that we'll have to // create as few segments as possible. sort_output_sections(ctx); + if (!ctx.arg.separate_debug_file.empty()) + separate_debug_sections(ctx); + + // Compute sizes of output sections while assigning offsets + // within an output section to input sections. + compute_section_sizes(ctx); + // If --packed_dyn_relocs=relr was given, base relocations are stored // to a .relr.dyn section in a compressed form. Construct a compressed // relocations now so that we can fix section sizes and file layout. @@ -564,7 +594,7 @@ int elf_main(int argc, char **argv) { // Reserve a space for dynamic symbol strings in .dynstr and sort // .dynsym contents if necessary. Beyond this point, no symbol will // be added to .dynsym. - ctx.dynsym->finalize(ctx); + sort_dynsyms(ctx); // Print reports about undefined symbols, if needed. if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR) @@ -578,7 +608,8 @@ int elf_main(int argc, char **argv) { ctx.verneed->construct(ctx); // Compute .symtab and .strtab sizes for each file. 
- create_output_symtab(ctx); + if (!ctx.arg.strip_all) + create_output_symtab(ctx); // .eh_frame is a special section from the linker's point of view, // as its contents are parsed and reconstructed by the linker, @@ -601,8 +632,17 @@ int elf_main(int argc, char **argv) { // that they can jump to anywhere in ±2 GiB by default. They may // be replaced with shorter instruction sequences if destinations // are close enough. Do this optimization. - if constexpr (is_riscv) - filesize = riscv_resize_sections(ctx); + if constexpr (is_riscv || is_loongarch) { + shrink_sections(ctx); + filesize = set_osec_offsets(ctx); + } + + if constexpr (is_arm32) { + if (ctx.extra.exidx) { + ctx.extra.exidx->remove_duplicate_entries(ctx); + filesize = set_osec_offsets(ctx); + } + } // At this point, memory layout is fixed. @@ -614,16 +654,17 @@ int elf_main(int argc, char **argv) { // If --compress-debug-sections is given, compress .debug_* sections // using zlib. - if (ctx.arg.compress_debug_sections != COMPRESS_NONE) - filesize = compress_debug_sections(ctx); + if (ctx.arg.compress_debug_sections != COMPRESS_NONE) { + compress_debug_sections(ctx); + filesize = set_osec_offsets(ctx); + } // At this point, both memory and file layouts are fixed. t_before_copy.stop(); // Create an output file - ctx.output_file = - OutputFile>::open(ctx, ctx.arg.output, filesize, 0777); + ctx.output_file = OutputFile::open(ctx, ctx.arg.output, filesize, 0777); ctx.buf = ctx.output_file->buf; Timer t_copy(ctx, "copy"); @@ -631,24 +672,28 @@ int elf_main(int argc, char **argv) { // Copy input sections to the output file and apply relocations. copy_chunks(ctx); + if constexpr (is_x86_64) + if (ctx.arg.z_rewrite_endbr) + rewrite_endbr(ctx); + // Dynamic linker works better with sorted .rela.dyn section, // so we sort them. ctx.reldyn->sort(ctx); - // Zero-clear paddings between sections - clear_padding(ctx); + // .note.gnu.build-id section contains a cryptographic hash of the + // entire output file. 
Now that we wrote everything except build-id, + // we can compute it. + if (ctx.buildid) + write_build_id(ctx); // .gdb_index's contents cannot be constructed before applying // relocations to other debug sections. We have relocated debug // sections now, so write the .gdb_index section. - if (ctx.gdb_index) + if (ctx.gdb_index && ctx.arg.separate_debug_file.empty()) write_gdb_index(ctx); - // .note.gnu.build-id section contains a cryptographic hash of the - // entire output file. Now that we wrote everything except build-id, - // we can compute it. - if (ctx.buildid) - ctx.buildid->write_buildid(ctx); + if (!ctx.arg.separate_debug_file.empty()) + write_gnu_debuglink(ctx); t_copy.stop(); ctx.checkpoint(); @@ -660,7 +705,7 @@ int elf_main(int argc, char **argv) { if (!ctx.arg.dependency_file.empty()) write_dependency_file(ctx); - if (ctx.has_lto_object) + if (!ctx.arg.plugin.empty()) lto_cleanup(ctx); t_all.stop(); @@ -668,6 +713,9 @@ int elf_main(int argc, char **argv) { if (ctx.arg.print_map) print_map(ctx); + if (!ctx.arg.separate_debug_file.empty()) + write_separate_debug_file(ctx); + // Show stats numbers if (ctx.arg.stats) show_stats(ctx); @@ -677,10 +725,9 @@ int elf_main(int argc, char **argv) { std::cout << std::flush; std::cerr << std::flush; - if (on_complete) - on_complete(); - release_global_lock(ctx); + notify_parent(); + release_global_lock(); if (ctx.arg.quick_exit) _exit(0); @@ -691,14 +738,8 @@ int elf_main(int argc, char **argv) { return 0; } -#ifdef MOLD_X86_64 -int main(int argc, char **argv) { - return elf_main(argc, argv); -} -#endif - using E = MOLD_TARGET; -template int elf_main(int, char **); +template int mold_main(int, char **); -} // namespace mold::elf +} // namespace mold diff --git a/elf/mapfile.cc b/src/mapfile.cc similarity index 96% rename from elf/mapfile.cc rename to src/mapfile.cc index 5975b106..8d60971b 100644 --- a/elf/mapfile.cc +++ b/src/mapfile.cc @@ -7,7 +7,7 @@ #include #include -namespace mold::elf { +namespace mold { 
template using Map = @@ -54,6 +54,8 @@ static Map get_map(Context &ctx) { template void print_map(Context &ctx) { + Timer t(ctx, "print_map"); + std::ostream *out = &std::cout; std::unique_ptr file; @@ -75,7 +77,7 @@ void print_map(Context &ctx) { << std::setw(6) << (u64)osec->shdr.sh_addralign << " " << osec->name << "\n"; - if (osec->kind() != OUTPUT_SECTION) + if (!osec->to_osec()) continue; std::span *> members = ((OutputSection *)osec)->members; @@ -84,7 +86,6 @@ void print_map(Context &ctx) { tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { InputSection *mem = members[i]; std::ostringstream ss; - opt_demangle = ctx.arg.demangle; u64 addr = osec->shdr.sh_addr + mem->offset; ss << std::showbase @@ -113,4 +114,4 @@ using E = MOLD_TARGET; template void print_map(Context &ctx); -} // namespace mold::elf +} // namespace mold diff --git a/elf/mold-wrapper.c b/src/mold-wrapper.c similarity index 87% rename from elf/mold-wrapper.c rename to src/mold-wrapper.c index 35e55c6e..d63d4de1 100644 --- a/elf/mold-wrapper.c +++ b/src/mold-wrapper.c @@ -1,8 +1,5 @@ #define _GNU_SOURCE 1 -#if !defined(__OpenBSD__) && !defined(__FreeBSD__) -# include -#endif #include #include #include @@ -12,6 +9,10 @@ #include #include +#if __has_include() +# include +#endif + extern char **environ; static char *get_mold_path() { @@ -132,3 +133,14 @@ int posix_spawn(pid_t *pid, const char *path, typeof(posix_spawn) *real = dlsym(RTLD_NEXT, "posix_spawn"); return real(pid, path, file_actions, attrp, argv, envp); } + +int posix_spawnp(pid_t *pid, const char *file, + const posix_spawn_file_actions_t *file_actions, + const posix_spawnattr_t *attrp, + char *const *argv, char *const *envp) { + debug_print("posix_spawnp %s\n", file); + if (is_ld(file)) + file = get_mold_path(); + typeof(posix_spawnp) *real = dlsym(RTLD_NEXT, "posix_spawnp"); + return real(pid, file, file_actions, attrp, argv, envp); +} diff --git a/elf/mold.h b/src/mold.h similarity index 82% rename from elf/mold.h rename to 
src/mold.h index ecafca93..322a0ea8 100644 --- a/elf/mold.h +++ b/src/mold.h @@ -1,7 +1,7 @@ #pragma once +#include "../lib/common.h" #include "elf.h" -#include "../common/common.h" #include #include @@ -34,7 +34,7 @@ # include #endif -namespace mold::elf { +namespace mold { template class InputFile; template class InputSection; @@ -47,21 +47,26 @@ template class Symbol; template struct CieRecord; template struct Context; template struct FdeRecord; +template class MergeableSection; template class RelocSection; template std::ostream &operator<<(std::ostream &out, const Symbol &sym); +std::string get_mold_version(); + // // Mergeable section fragments // template -struct SectionFragment { +struct __attribute__((aligned(4))) SectionFragment { SectionFragment(MergedSection *sec, bool is_alive) : output_section(*sec), is_alive(is_alive) {} - u64 get_addr(Context &ctx) const; + u64 get_addr(Context &ctx) const { + return output_section.shdr.sh_addr + offset; + } MergedSection &output_section; u32 offset = -1; @@ -95,12 +100,12 @@ struct SymbolAux : SymbolAux { // template -class RangeExtensionThunk {}; +class Thunk {}; template -class RangeExtensionThunk { +class Thunk { public: - RangeExtensionThunk(OutputSection &osec, i64 offset) + Thunk(OutputSection &osec, i64 offset) : output_section(osec), offset(offset) {} i64 size() const { return E::thunk_hdr_size + symbols.size() * E::thunk_size; } @@ -119,9 +124,11 @@ class RangeExtensionThunk { std::vector *> symbols; }; -struct RangeExtensionRef { - i16 thunk_idx = -1; - i16 sym_idx = -1; +struct ThunkRef { + static constexpr i64 MAX_SYM_IDX = (1 << 17) - 1; + + i32 thunk_idx : 14 = -1; + i32 sym_idx : 18 = -1; }; // @@ -171,14 +178,13 @@ struct CieRecord { } std::span> get_rels() const { - i64 end = rel_idx; - while (end < rels.size() && rels[end].r_offset < input_offset + size()) - end++; - return rels.subspan(rel_idx, end - rel_idx); + i64 end = input_offset + size(); + i64 i = rel_idx; + while (i < rels.size() && 
rels[i].r_offset < end) + i++; + return rels.subspan(rel_idx, i - rel_idx); } - bool equals(const CieRecord &other) const; - ObjectFile &file; InputSection &input_section; u32 input_offset = -1; @@ -190,14 +196,30 @@ struct CieRecord { std::string_view contents; }; +template +bool cie_equals(const CieRecord &a, const CieRecord &b); + template struct FdeRecord { FdeRecord(u32 input_offset, u32 rel_idx) : input_offset(input_offset), rel_idx(rel_idx) {} - i64 size(ObjectFile &file) const; - std::string_view get_contents(ObjectFile &file) const; - std::span> get_rels(ObjectFile &file) const; + i64 size(ObjectFile &file) const { + return *(U32 *)(file.cies[cie_idx].contents.data() + input_offset) + 4; + } + + std::string_view get_contents(ObjectFile &file) const { + return file.cies[cie_idx].contents.substr(input_offset, size(file)); + } + + std::span> get_rels(ObjectFile &file) const { + std::span> rels = file.cies[cie_idx].rels; + i64 end = input_offset + size(file); + i64 i = rel_idx; + while (i < rels.size() && rels[i].r_offset < end) + i++; + return rels.subspan(rel_idx, i - rel_idx); + } u32 input_offset = -1; u32 output_offset = -1; @@ -212,22 +234,22 @@ struct InputSectionExtras {}; template struct InputSectionExtras { - std::vector range_extn; + std::vector thunk_refs; }; -template +template requires is_riscv || is_loongarch struct InputSectionExtras { std::vector r_deltas; }; // InputSection represents a section in an input object file. 
template -class InputSection { +class __attribute__((aligned(4))) InputSection { public: InputSection(Context &ctx, ObjectFile &file, i64 shndx); void uncompress(Context &ctx); - void uncompress_to(Context &ctx, u8 *buf); + void copy_contents(Context &ctx, u8 *buf); void scan_relocations(Context &ctx); void write_to(Context &ctx, u8 *buf); void apply_reloc_alloc(Context &ctx, u8 *base); @@ -242,13 +264,15 @@ class InputSection { std::span> get_fdes() const; std::string_view get_func_name(Context &ctx, i64 offset) const; bool is_relr_reloc(Context &ctx, const ElfRel &rel) const; - bool is_killed_by_icf() const; - + bool icf_removed() const; bool record_undef_error(Context &ctx, const ElfRel &rel); + std::pair *, i64> + get_fragment(Context &ctx, const ElfRel &rel); + ObjectFile &file; OutputSection *output_section = nullptr; - u64 sh_size = -1; + i64 sh_size = -1; std::string_view contents; @@ -257,10 +281,10 @@ class InputSection { i32 fde_begin = -1; i32 fde_end = -1; - u64 offset = -1; - u32 shndx = -1; - u32 relsec_idx = -1; - u32 reldyn_offset = 0; + i64 offset = -1; + i32 shndx = -1; + i32 relsec_idx = -1; + i32 reldyn_offset = 0; bool uncompressed = false; @@ -282,15 +306,13 @@ class InputSection { // - `leader == this`: This section was retained. // - `leader != this`: This section was merged with another identical section. 
InputSection *leader = nullptr; - u32 icf_idx = -1; + i32 icf_idx = -1; bool icf_eligible = false; bool icf_leaf = false; private: void scan_pcrel(Context &ctx, Symbol &sym, const ElfRel &rel); void scan_absrel(Context &ctx, Symbol &sym, const ElfRel &rel); - void scan_dyn_absrel(Context &ctx, Symbol &sym, const ElfRel &rel); - void scan_toc_rel(Context &ctx, Symbol &sym, const ElfRel &rel); void scan_tlsdesc(Context &ctx, Symbol &sym); void check_tlsle(Context &ctx, Symbol &sym, const ElfRel &rel); @@ -300,11 +322,6 @@ class InputSection { void apply_toc_rel(Context &ctx, Symbol &sym, const ElfRel &rel, u8 *loc, u64 S, i64 A, u64 P, ElfRel **dynrel); - void copy_contents_riscv(Context &ctx, u8 *buf); - - std::pair *, i64> - get_fragment(Context &ctx, const ElfRel &rel); - u64 get_thunk_addr(i64 idx); std::optional get_tombstone(Symbol &sym, SectionFragment *frag); @@ -314,22 +331,26 @@ class InputSection { // tls.cc // -template u64 get_tls_begin(Context &); -template u64 get_tp_addr(Context &); -template u64 get_dtp_addr(Context &); +template u64 get_tp_addr(const ElfPhdr &); +template u64 get_dtp_addr(const ElfPhdr &); // // output-chunks.cc // template -u64 get_eflags(Context &ctx); +Chunk *find_chunk(Context &ctx, u32 sh_type); template -i64 to_phdr_flags(Context &ctx, Chunk *chunk); +Chunk *find_chunk(Context &ctx, std::string_view name); + +template +u64 get_eflags(Context &ctx) { + return 0; +} template -std::string_view get_output_name(Context &ctx, std::string_view name, u64 flags); +i64 to_phdr_flags(Context &ctx, Chunk *chunk); template void write_plt_header(Context &ctx, u8 *buf); @@ -340,19 +361,18 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym); template void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym); -typedef enum { HEADER, OUTPUT_SECTION, SYNTHETIC } ChunkKind; - // Chunk represents a contiguous region in an output file. 
template -class Chunk { +class __attribute__((aligned(4))) Chunk { public: virtual ~Chunk() = default; - virtual ChunkKind kind() { return SYNTHETIC; } + virtual bool is_header() { return false; } virtual OutputSection *to_osec() { return nullptr; } + virtual void compute_section_size(Context &ctx) {} virtual i64 get_reldyn_size(Context &ctx) const { return 0; } virtual void construct_relr(Context &ctx) {} virtual void copy_buf(Context &ctx) {} - virtual void write_to(Context &ctx, u8 *buf) { unreachable(); } + virtual void write_to(Context &ctx, u8 *buf, ElfRel *rel) { unreachable(); } virtual void update_shdr(Context &ctx) {} std::string_view name; @@ -397,7 +417,7 @@ class OutputEhdr : public Chunk { this->shdr.sh_addralign = sizeof(Word); } - ChunkKind kind() override { return HEADER; } + bool is_header() override { return true; } void copy_buf(Context &ctx) override; }; @@ -411,7 +431,7 @@ class OutputShdr : public Chunk { this->shdr.sh_addralign = sizeof(Word); } - ChunkKind kind() override { return HEADER; } + bool is_header() override { return true; } void copy_buf(Context &ctx) override; }; @@ -425,7 +445,7 @@ class OutputPhdr : public Chunk { this->shdr.sh_addralign = sizeof(Word); } - ChunkKind kind() override { return HEADER; } + bool is_header() override { return true; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; @@ -445,25 +465,51 @@ class InterpSection : public Chunk { void copy_buf(Context &ctx) override; }; +enum AbsRelKind { + ABS_REL_NONE, + ABS_REL_BASEREL, + ABS_REL_RELR, + ABS_REL_IFUNC, + ABS_REL_DYNREL, +}; + +// Represents a word-size absolute relocation (e.g. 
R_X86_64_64) +template +struct AbsRel { + InputSection *isec = nullptr; + u64 offset = 0; + Symbol *sym = nullptr; + i64 addend = 0; + AbsRelKind kind = ABS_REL_NONE; +}; + // Sections template class OutputSection : public Chunk { public: - OutputSection(Context &ctx, std::string_view name, u32 type, u64 flags); - ChunkKind kind() override { return OUTPUT_SECTION; } + OutputSection(std::string_view name, u32 type) { + this->name = name; + this->shdr.sh_type = type; + } + OutputSection *to_osec() override { return this; } + void compute_section_size(Context &ctx) override; + i64 get_reldyn_size(Context &ctx) const override; void construct_relr(Context &ctx) override; void copy_buf(Context &ctx) override; - void write_to(Context &ctx, u8 *buf) override; + void write_to(Context &ctx, u8 *buf, ElfRel *rel) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; + void scan_abs_relocations(Context &ctx); void create_range_extension_thunks(Context &ctx); std::vector *> members; - std::vector>> thunks; + std::vector>> thunks; std::unique_ptr> reloc_sec; + std::vector> abs_rels; + Atomic sh_flags; }; template @@ -501,7 +547,7 @@ class GotSection : public Chunk { std::vector *> tlsgd_syms; std::vector *> tlsdesc_syms; std::vector *> gottp_syms; - u32 tlsld_idx = -1; + i64 tlsld_idx = -1; }; template @@ -664,13 +710,19 @@ class DynstrSection : public Chunk { template class DynamicSection : public Chunk { public: - DynamicSection() { + DynamicSection(Context &ctx) { this->name = ".dynamic"; - this->is_relro = true; this->shdr.sh_type = SHT_DYNAMIC; - this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; this->shdr.sh_addralign = sizeof(Word); this->shdr.sh_entsize = sizeof(ElfDyn); + + if (ctx.arg.z_rodynamic) { + this->shdr.sh_flags = SHF_ALLOC; + this->is_relro = false; + } else { + this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; + this->is_relro = true; + } } void update_shdr(Context &ctx) override; @@ -718,12 +770,10 @@ class 
DynsymSection : public Chunk { } void add_symbol(Context &ctx, Symbol *sym); - void finalize(Context &ctx); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector *> symbols; - bool finalized = false; }; template @@ -751,7 +801,6 @@ class GnuHashSection : public Chunk { this->shdr.sh_addralign = sizeof(Word); } - std::span *> get_exported_symbols(Context &ctx); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; @@ -759,33 +808,37 @@ class GnuHashSection : public Chunk { static constexpr i64 HEADER_SIZE = 16; static constexpr i64 BLOOM_SHIFT = 26; - u32 num_buckets = -1; - u32 num_bloom = 1; + i64 num_buckets = -1; + i64 num_bloom = 1; + i64 num_exported = -1; }; template class MergedSection : public Chunk { public: static MergedSection * - get_instance(Context &ctx, std::string_view name, i64 type, i64 flags, - i64 entsize, i64 addralign); + get_instance(Context &ctx, std::string_view name, const ElfShdr &shdr); SectionFragment *insert(Context &ctx, std::string_view data, u64 hash, i64 p2align); - void assign_offsets(Context &ctx); + void resolve(Context &ctx); + void compute_section_size(Context &ctx) override; void copy_buf(Context &ctx) override; - void write_to(Context &ctx, u8 *buf) override; + void write_to(Context &ctx, u8 *buf, ElfRel *rel) override; void print_stats(Context &ctx); + std::vector *> members; + std::mutex mu; + + ConcurrentMap> map; HyperLogLog estimator; + bool resolved = false; private: MergedSection(std::string_view name, i64 flags, i64 type, i64 entsize); - ConcurrentMap> map; std::vector shard_offsets; - std::once_flag once_flag; }; template @@ -819,7 +872,7 @@ class EhFrameHdrSection : public Chunk { void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; - u32 num_fdes = 0; + i64 num_fdes = 0; }; template @@ -848,7 +901,6 @@ class CopyrelSection : public Chunk { } void add_symbol(Context &ctx, Symbol *sym); - void update_shdr(Context &ctx) override; 
i64 get_reldyn_size(Context &ctx) const override { return symbols.size(); } void copy_buf(Context &ctx) override; @@ -919,9 +971,8 @@ class BuildIdSection : public Chunk { void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; - void write_buildid(Context &ctx); - static constexpr i64 HEADER_SIZE = 16; + std::vector contents; }; template @@ -957,6 +1008,22 @@ class NotePropertySection : public Chunk { std::map properties; }; +template +class GnuDebuglinkSection : public Chunk { +public: + GnuDebuglinkSection() { + this->name = ".gnu_debuglink"; + this->shdr.sh_type = SHT_PROGBITS; + this->shdr.sh_addralign = 4; + } + + void update_shdr(Context &ctx) override; + void copy_buf(Context &ctx) override; + + std::string filename; + u32 crc32 = 0; +}; + template class GdbIndexSection : public Chunk { public: @@ -975,7 +1042,7 @@ class CompressedSection : public Chunk { private: ElfChdr chdr = {}; - std::unique_ptr compressed; + std::unique_ptr compressor; }; template @@ -1026,6 +1093,82 @@ class ComdatGroupSection : public Chunk { std::vector *> members; }; +// +// output-file.cc +// + +template +class OutputFile { +public: + static std::unique_ptr> + open(Context &ctx, std::string path, i64 filesize, int perm); + + virtual void close(Context &ctx) = 0; + virtual ~OutputFile() = default; + + u8 *buf = nullptr; + std::vector buf2; + std::string path; + int fd = -1; + i64 filesize = 0; + bool is_mmapped = false; + bool is_unmapped = false; + +protected: + OutputFile(std::string path, i64 filesize, bool is_mmapped) + : path(path), filesize(filesize), is_mmapped(is_mmapped) {} +}; + +template +class MallocOutputFile : public OutputFile { +public: + MallocOutputFile(Context &ctx, std::string path, i64 filesize, int perm) + : OutputFile(path, filesize, false), ptr(new u8[filesize]), + perm(perm) { + this->buf = ptr.get(); + } + + void close(Context &ctx) override { + Timer t(ctx, "close_file"); + FILE *fp; + + if (this->path == "-") { + fp = stdout; + } 
else { +#ifdef _WIN32 + int pmode = (perm & 0200) ? (_S_IREAD | _S_IWRITE) : _S_IREAD; + i64 fd = _open(this->path.c_str(), _O_RDWR | _O_CREAT | _O_BINARY, pmode); +#else + i64 fd = ::open(this->path.c_str(), O_RDWR | O_CREAT, perm); +#endif + if (fd == -1) + Fatal(ctx) << "cannot open " << this->path << ": " << errno_string(); +#ifdef _WIN32 + fp = _fdopen(fd, "wb"); +#else + fp = fdopen(fd, "w"); +#endif + } + + fwrite(this->buf, this->filesize, 1, fp); + if (!this->buf2.empty()) + fwrite(this->buf2.data(), this->buf2.size(), 1, fp); + fclose(fp); + } + +private: + std::unique_ptr ptr; + int perm; +}; + +template +class LockingOutputFile : public OutputFile { +public: + LockingOutputFile(Context &ctx, std::string path, int perm); + void resize(Context &ctx, i64 filesize); + void close(Context &ctx) override; +}; + // // gdb-index.cc // @@ -1048,37 +1191,43 @@ template void write_gdb_index(Context &ctx); // discards the other by eliminating all sections that the other // comdat section refers to. struct ComdatGroup { - ComdatGroup() = default; - ComdatGroup(const ComdatGroup &other) : owner(other.owner.load()) {} - // The file priority of the owner file of this comdat section. 
- std::atomic_uint32_t owner = -1; + Atomic owner = -1; }; template struct ComdatGroupRef { ComdatGroup *group; - u32 sect_idx; + i32 sect_idx; std::span> members; }; template -struct MergeableSection { +class MergeableSection { +public: + MergeableSection(Context &ctx, MergedSection &parent, + std::unique_ptr> &isec); + + void split_contents(Context &ctx); + void resolve_contents(Context &ctx); std::pair *, i64> get_fragment(i64 offset); + std::string_view get_contents(i64 idx); - MergedSection *parent; - u8 p2align = 0; - std::vector strings; - std::vector hashes; - std::vector frag_offsets; + MergedSection &parent; std::vector *> fragments; + +private: + std::unique_ptr> section; + std::vector frag_offsets; + std::vector hashes; + u8 p2align = 0; }; // InputFile is the base class of ObjectFile and SharedFile. template class InputFile { public: - InputFile(Context &ctx, MappedFile> *mf); + InputFile(Context &ctx, MappedFile *mf); InputFile() : filename("") {} virtual ~InputFile() = default; @@ -1098,7 +1247,6 @@ class InputFile { ElfShdr *find_section(i64 type); virtual void resolve_symbols(Context &ctx) = 0; - void clear_symbols(); virtual void mark_live_objects(Context &ctx, @@ -1107,7 +1255,7 @@ class InputFile { std::span *> get_global_syms(); std::string_view get_source_name() const; - MappedFile> *mf = nullptr; + MappedFile *mf = nullptr; std::span> elf_sections; std::span> elf_syms; std::vector *> symbols; @@ -1115,11 +1263,14 @@ class InputFile { std::string filename; bool is_dso = false; - u32 priority; + i64 priority; Atomic is_alive = false; std::string_view shstrtab; std::string_view symbol_strtab; + bool has_init_array = false; + bool has_ctors = false; + // To create an output .symtab u64 local_symtab_idx = 0; u64 global_symtab_idx = 0; @@ -1156,12 +1307,14 @@ class ObjectFile : public InputFile { public: ObjectFile() = default; - static ObjectFile *create(Context &ctx, MappedFile> *mf, + static ObjectFile *create(Context &ctx, MappedFile *mf, 
std::string archive_name, bool is_in_lib); void parse(Context &ctx); - void initialize_mergeable_sections(Context &ctx); - void resolve_section_pieces(Context &ctx); + void initialize_symbols(Context &ctx); + void parse_ehframe(Context &ctx); + void convert_mergeable_sections(Context &ctx); + void reattach_section_pieces(Context &ctx); void resolve_symbols(Context &ctx) override; void mark_live_objects(Context &ctx, std::function *)> feeder) override; @@ -1183,18 +1336,17 @@ class ObjectFile : public InputFile { std::vector> fdes; BitVector has_symver; std::vector> comdat_groups; - InputSection *eh_frame_section = nullptr; + std::vector *> eh_frame_sections; bool exclude_libs = false; std::map gnu_properties; - bool is_lto_obj = false; bool needs_executable_stack = false; + bool is_lto_obj = false; + bool is_gcc_offload_obj = false; + bool is_rust_obj = false; - u64 num_dynrel = 0; - u64 reldyn_offset = 0; - - u64 fde_idx = 0; - u64 fde_offset = 0; - u64 fde_size = 0; + i64 fde_idx = 0; + i64 fde_offset = 0; + i64 fde_size = 0; // For ICF std::unique_ptr> llvm_addrsig; @@ -1205,21 +1357,19 @@ class ObjectFile : public InputFile { InputSection *debug_pubtypes = nullptr; // For LTO - std::vector lto_symbol_versions; + std::vector> lto_elf_syms; // Target-specific member [[no_unique_address]] ObjectFileExtras extra; private: - ObjectFile(Context &ctx, MappedFile> *mf, + ObjectFile(Context &ctx, MappedFile *mf, std::string archive_name, bool is_in_lib); void initialize_sections(Context &ctx); - void initialize_symbols(Context &ctx); void sort_relocations(Context &ctx); void initialize_ehframe_sections(Context &ctx); void parse_note_gnu_property(Context &ctx, const ElfShdr &shdr); - void parse_ehframe(Context &ctx); void override_symbol(Context &ctx, Symbol &sym, const ElfSym &esym, i64 symidx); void merge_visibility(Context &ctx, Symbol &sym, u8 visibility); @@ -1234,12 +1384,13 @@ class ObjectFile : public InputFile { template class SharedFile : public InputFile { 
public: - static SharedFile *create(Context &ctx, MappedFile> *mf); + static SharedFile *create(Context &ctx, MappedFile *mf); void parse(Context &ctx); void resolve_symbols(Context &ctx) override; - std::span *> find_aliases(Symbol *sym); + std::span *> get_symbols_at(Symbol *sym); i64 get_alignment(Symbol *sym); + std::vector get_dt_needed(Context &ctx); bool is_readonly(Symbol *sym); void mark_live_objects(Context &ctx, @@ -1253,33 +1404,32 @@ class SharedFile : public InputFile { std::vector> elf_syms2; private: - SharedFile(Context &ctx, MappedFile> *mf); + SharedFile(Context &ctx, MappedFile *mf) : InputFile(ctx, mf) {} std::string get_soname(Context &ctx); void maybe_override_symbol(Symbol &sym, const ElfSym &esym); + std::vector read_dt_needed(Context &ctx); std::vector read_verdef(Context &ctx); std::vector versyms; const ElfShdr *symtab_sec; - // Used by find_aliases() - std::once_flag init_aliases; - std::vector *> aliases; + // Used by get_symbols_at() + std::once_flag init_sorted_syms; + std::vector *> sorted_syms; }; // // linker-script.cc // -template -void parse_linker_script(Context &ctx, MappedFile> *mf); - -template -std::string_view -get_script_output_type(Context &ctx, MappedFile> *mf); - -template -void parse_version_script(Context &ctx, MappedFile> *mf); +struct ReaderContext { + bool as_needed = false; + bool in_lib = false; + bool static_ = false; + bool whole_archive = false; + tbb::task_group *tg = nullptr; +}; struct DynamicPattern { std::string_view pattern; @@ -1287,6 +1437,48 @@ struct DynamicPattern { bool is_cpp = false; }; +template +class Script { +public: + Script(Context &ctx, ReaderContext &rctx, MappedFile *mf) + : ctx(ctx), rctx(rctx), mf(mf) {} + + std::string_view get_script_output_type(); + void parse_linker_script(); + void parse_version_script(); + std::vector parse_dynamic_list(); + +private: + [[noreturn]] void error(std::string_view pos, std::string msg); + + void tokenize(); + + std::span + skip(std::span tok, 
std::string_view str); + + std::span read_output_format(std::span tok); + std::span read_group(std::span tok); + + std::span + read_version_script_commands(std::span tok, + std::string_view ver_str, u16 ver_idx, + bool is_global, bool is_cpp); + + std::span read_version_script(std::span tok); + + MappedFile *resolve_path(std::string_view tok, bool check_target); + + std::span + read_dynamic_list_commands(std::span tok, + std::vector &result, bool is_cpp); + + Context &ctx; + ReaderContext &rctx; + MappedFile *mf = mf; + std::once_flag once; + std::vector tokens; +}; + template std::vector parse_dynamic_list(Context &ctx, std::string_view path); @@ -1296,14 +1488,28 @@ parse_dynamic_list(Context &ctx, std::string_view path); // template -ObjectFile *read_lto_object(Context &ctx, MappedFile> *mb); +ObjectFile *read_lto_object(Context &ctx, MappedFile *mb); template -std::vector *> do_lto(Context &ctx); +std::vector *> run_lto_plugin(Context &ctx); template void lto_cleanup(Context &ctx); +// +// shrink-sections.cc +// + +template +void shrink_sections(Context &ctx); + +template +void shrink_section(Context &ctx, InputSection &isec, bool use_rvc); + +template +i64 compute_distance(Context &ctx, Symbol &sym, + InputSection &isec, const ElfRel &rel); + // // gc-sections.cc // @@ -1336,23 +1542,20 @@ void print_map(Context &ctx); // subprocess.cc // -std::function fork_child(); +void fork_child(); +void notify_parent(); template [[noreturn]] void process_run_subcommand(Context &ctx, int argc, char **argv); -// -// jobs.cc -// - -template void acquire_global_lock(Context &ctx); -template void release_global_lock(Context &ctx); - // // cmdline.cc // +template +std::vector expand_response_files(Context &ctx, char **argv); + template std::vector parse_nonpositional_args(Context &ctx); @@ -1366,16 +1569,18 @@ template void apply_exclude_libs(Context &); template void create_synthetic_sections(Context &); template void set_file_priority(Context &); template void 
resolve_symbols(Context &); -template void kill_eh_frame_sections(Context &); -template void resolve_section_pieces(Context &); +template void do_lto(Context &); +template void parse_eh_frame_sections(Context &); +template void create_merged_sections(Context &); template void convert_common_symbols(Context &); -template void compute_merged_section_sizes(Context &); template void create_output_sections(Context &); template void add_synthetic_symbols(Context &); +template void apply_section_align(Context &); template void check_cet_errors(Context &); template void print_dependencies(Context &); template void write_repro_file(Context &); template void check_duplicate_symbols(Context &); +template void check_shlib_undefined(Context &); template void check_symbol_types(Context &); template void sort_init_fini(Context &); template void sort_ctor_dtor(Context &); @@ -1385,7 +1590,9 @@ template void compute_section_sizes(Context &); template void sort_output_sections(Context &); template void claim_unresolved_symbols(Context &); template void scan_relocations(Context &); +template void compute_imported_symbol_weakness(Context &); template void construct_relr(Context &); +template void sort_dynsyms(Context &); template void create_output_symtab(Context &); template void report_undef_errors(Context &); template void create_reloc_sections(Context &); @@ -1394,20 +1601,49 @@ template void apply_version_script(Context &); template void parse_symbol_version(Context &); template void compute_import_export(Context &); template void compute_address_significance(Context &); -template void clear_padding(Context &); +template void separate_debug_sections(Context &); template void compute_section_headers(Context &); template i64 set_osec_offsets(Context &); template void fix_synthetic_symbols(Context &); -template i64 compress_debug_sections(Context &); +template void compress_debug_sections(Context &); +template void write_build_id(Context &); +template void 
write_gnu_debuglink(Context &); +template void write_separate_debug_file(Context &ctx); template void write_dependency_file(Context &); template void show_stats(Context &); +// +// arch-x86-64.cc +// + +void rewrite_endbr(Context &ctx); + // // arch-arm32.cc // +class Arm32ExidxSection : public Chunk { +public: + Arm32ExidxSection(OutputSection &osec) : output_section(osec) { + this->name = ".ARM.exidx"; + this->shdr.sh_type = SHT_ARM_EXIDX; + this->shdr.sh_flags = SHF_ALLOC; + this->shdr.sh_addralign = 4; + } + + void compute_section_size(Context &ctx) override; + void update_shdr(Context &ctx) override; + void remove_duplicate_entries(Context &ctx); + void copy_buf(Context &ctx) override; + +private: + std::vector get_contents(Context &ctx); + + OutputSection &output_section; +}; + template <> u64 get_eflags(Context &ctx); -void fixup_arm_exidx_section(Context &ctx); +void create_arm_exidx_section(Context &ctx); // // arch-riscv.cc @@ -1430,9 +1666,6 @@ class RiscvAttributesSection : public Chunk { template u64 get_eflags(Context &ctx); -template -i64 riscv_resize_sections(Context &ctx); - // // arch-ppc64v1.cc // @@ -1462,62 +1695,41 @@ class PPC64OpdSection : public Chunk { // arch-ppc64v2.cc // -template <> u64 get_eflags(Context &ctx); +extern const std::vector> +ppc64_save_restore_insns; -// -// arch-sparc.cc -// - -class SparcTlsGetAddrSection : public Chunk { +class PPC64SaveRestoreSection : public Chunk { public: - SparcTlsGetAddrSection() { - this->name = ".tls_get_addr"; + PPC64SaveRestoreSection() { + this->name = ".save_restore_gprs"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_EXECINSTR; - this->shdr.sh_addralign = 4; - this->shdr.sh_size = 24; + this->shdr.sh_addralign = 16; + this->shdr.sh_size = ppc64_save_restore_insns.size() * 4; } - void copy_buf(Context &ctx) override; + void copy_buf(Context &ctx) override; }; -// -// arch-alpha.cc -// - -class AlphaGotSection : public Chunk { -public: - AlphaGotSection() { - 
this->name = ".alpha_got"; - this->is_relro = true; - this->shdr.sh_type = SHT_PROGBITS; - this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; - this->shdr.sh_addralign = 8; - } - - void add_symbol(Symbol &sym, i64 addend); - void finalize(); - u64 get_addr(Symbol &sym, i64 addend); - i64 get_reldyn_size(Context &ctx) const override; - void copy_buf(Context &ctx) override; - - struct Entry { - bool operator==(const Entry &) const = default; - Symbol *sym; - i64 addend; - }; - -private: - std::vector entries; - std::mutex mu; -}; +template <> u64 get_eflags(Context &ctx); // // main.cc // struct BuildId { - i64 size() const; + i64 size() const { + switch (kind) { + case HEX: + return value.size(); + case HASH: + return hash_size; + case UUID: + return 16; + default: + unreachable(); + } + } enum { NONE, HEX, HASH, UUID } kind = NONE; std::vector value; @@ -1532,6 +1744,14 @@ typedef enum { UNRESOLVED_IGNORE, } UnresolvedKind; +typedef enum { + BSYMBOLIC_NONE, + BSYMBOLIC_ALL, + BSYMBOLIC_FUNCTIONS, + BSYMBOLIC_NON_WEAK, + BSYMBOLIC_NON_WEAK_FUNCTIONS, +} BsymbolicKind; + typedef enum { SEPARATE_LOADABLE_SEGMENTS, SEPARATE_CODE, @@ -1554,7 +1774,7 @@ struct VersionPattern { std::string_view pattern; std::string_view source; std::string_view ver_str; - u16 ver_idx = -1; + i64 ver_idx = -1; bool is_cpp = false; }; @@ -1568,6 +1788,11 @@ struct SectionOrder { template struct ContextExtras {}; +template <> +struct ContextExtras { + Arm32ExidxSection *exidx = nullptr; +}; + template struct ContextExtras { RiscvAttributesSection *riscv_attributes = nullptr; @@ -1586,19 +1811,14 @@ struct ContextExtras { template <> struct ContextExtras { + PPC64SaveRestoreSection *save_restore = nullptr; Symbol *TOC = nullptr; Atomic is_power10 = false; }; template <> struct ContextExtras { - SparcTlsGetAddrSection *tls_get_addr_sec = nullptr; - Symbol *tls_get_addr_sym = nullptr; -}; - -template <> -struct ContextExtras { - AlphaGotSection *got = nullptr; + Symbol *tls_get_addr = nullptr; }; // 
Context represents a context object for each invocation of the linker. @@ -1607,7 +1827,15 @@ struct ContextExtras { // resource management, and other miscellaneous objects. template struct Context { - Context() = default; + Context() { + arg.entry = get_symbol(*this, "_start"); + arg.fini = get_symbol(*this, "_fini"); + arg.init = get_symbol(*this, "_init"); + + if constexpr (is_sparc) + extra.tls_get_addr = get_symbol(*this, "__tls_get_addr"); + } + Context(const Context &) = delete; void checkpoint() { @@ -1619,21 +1847,27 @@ struct Context { // Command-line arguments struct { + BsymbolicKind Bsymbolic = BSYMBOLIC_NONE; BuildId build_id; CetReportKind z_cet_report = CET_REPORT_NONE; CompressKind compress_debug_sections = COMPRESS_NONE; + MultiGlob undefined_glob; SeparateCodeKind z_separate_code = NOSEPARATE_CODE; ShuffleSectionsKind shuffle_sections = SHUFFLE_SECTIONS_NONE; - UnresolvedKind unresolved_symbols = UNRESOLVED_ERROR; - bool Bsymbolic = false; - bool Bsymbolic_functions = false; + Symbol *entry = nullptr; + Symbol *fini = nullptr; + Symbol *init = nullptr; + UnresolvedKind unresolved_symbols = UNRESOLVED_IGNORE; bool allow_multiple_definition = false; + bool allow_shlib_undefined = true; bool apply_dynamic_relocs = true; bool color_diagnostics = false; bool default_symver = false; bool demangle = true; + bool detach = true; bool discard_all = false; bool discard_locals = false; + bool dynamic_list_data = false; bool eh_frame_hdr = true; bool emit_relocs = false; bool enable_new_dtags = true; @@ -1648,8 +1882,8 @@ struct Context { bool icf = false; bool icf_all = false; bool ignore_data_address_equality = false; - bool is_static = false; bool lto_pass2 = false; + bool nmagic = false; bool noinhibit_exec = false; bool oformat_binary = false; bool omagic = false; @@ -1669,6 +1903,7 @@ struct Context { bool rosegment = true; bool shared = false; bool start_stop = false; + bool static_ = false; bool stats = false; bool strip_all = false; bool strip_debug 
= false; @@ -1679,7 +1914,6 @@ struct Context { bool warn_once = false; bool warn_textrel = false; bool z_copyreloc = true; - bool z_defs = false; bool z_delete = true; bool z_dlopen = true; bool z_dump = true; @@ -1695,15 +1929,16 @@ struct Context { bool z_origin = false; bool z_relro = true; bool z_rewrite_endbr = false; + bool z_rodynamic = false; bool z_sectionheader = true; bool z_shstk = false; + bool z_start_stop_visibility_protected = false; bool z_text = false; i64 filler = -1; i64 spare_dynamic_tags = 5; + i64 spare_program_headers = 0; i64 thread_count = 0; i64 z_stack_size = 0; - u64 shuffle_sections_seed; - std::string_view emulation; std::optional unique; std::optional physical_image_base; std::string Map; @@ -1711,16 +1946,15 @@ struct Context { std::string dependency_file; std::string directory; std::string dynamic_linker; - std::string entry = "_start"; - std::string fini = "_fini"; - std::string init = "_init"; std::string output = "a.out"; std::string package_metadata; std::string plugin; std::string rpaths; + std::string separate_debug_file; std::string soname; std::string sysroot; - std::unique_ptr> retain_symbols_file; + std::string_view emulation; + std::optional *>> retain_symbols_file; std::unordered_map section_align; std::unordered_map section_start; std::unordered_set ignore_ir_file; @@ -1736,28 +1970,19 @@ struct Context { std::vector exclude_libs; std::vector filter; std::vector trace_symbol; + u32 z_x86_64_isa_level = 0; u64 image_base = 0x200000; + u64 shuffle_sections_seed = 0; } arg; std::vector version_patterns; std::vector dynamic_list_patterns; i64 default_version = VER_NDX_UNSPECIFIED; i64 page_size = E::page_size; - std::optional global_lock_fd; + bool has_error = false; // Reader context - bool as_needed = false; - bool whole_archive = false; - bool is_static; - bool in_lib = false; i64 file_priority = 10000; - std::unordered_set visited; - tbb::task_group tg; - - bool has_error = false; - bool has_lto_object = false; - 
Atomic has_init_array = false; - Atomic has_ctors = false; // Symbol table tbb::concurrent_hash_map, HashCmp> symbol_map; @@ -1770,7 +1995,7 @@ struct Context { tbb::concurrent_vector>> obj_pool; tbb::concurrent_vector>> dso_pool; tbb::concurrent_vector> string_pool; - tbb::concurrent_vector>>> mf_pool; + tbb::concurrent_vector> mf_pool; tbb::concurrent_vector>> chunk_pool; tbb::concurrent_vector>> osec_pool; @@ -1788,16 +2013,19 @@ struct Context { std::vector> internal_esyms; // Output buffer - std::unique_ptr>> output_file; + std::unique_ptr> output_file; u8 *buf = nullptr; bool overwrite_output_file = true; std::vector *> chunks; Atomic needs_tlsld = false; Atomic has_textrel = false; - Atomic num_ifunc_dynrels = 0; + Atomic num_ifunc_dynrels = 0; + + tbb::concurrent_hash_map *, std::vector> undef_errors; - tbb::concurrent_hash_map> undef_errors; + // For --separate-debug-file + std::vector *> debug_chunks; // Output chunks OutputEhdr *ehdr = nullptr; @@ -1814,6 +2042,7 @@ struct Context { DynstrSection *dynstr = nullptr; HashSection *hash = nullptr; GnuHashSection *gnu_hash = nullptr; + GnuDebuglinkSection *gnu_debuglink = nullptr; ShstrtabSection *shstrtab = nullptr; PltSection *plt = nullptr; PltGotSection *pltgot = nullptr; @@ -1833,6 +2062,7 @@ struct Context { NotePropertySection *note_property = nullptr; GdbIndexSection *gdb_index = nullptr; RelroPaddingSection *relro_padding = nullptr; + MergedSection *comment = nullptr; [[no_unique_address]] ContextExtras extra; @@ -1878,21 +2108,20 @@ struct Context { }; template -std::string_view get_machine_type(Context &ctx, MappedFile> *mf); +std::string_view +get_machine_type(Context &ctx, ReaderContext &rctx, MappedFile *mf); template -MappedFile> *open_library(Context &ctx, std::string path); +MappedFile *open_library(Context &ctx, ReaderContext &rctx, std::string path); template -MappedFile> *find_library(Context &ctx, std::string path); +MappedFile *find_library(Context &ctx, ReaderContext &rctx, std::string 
path); template -void read_file(Context &ctx, MappedFile> *mf); +void read_file(Context &ctx, ReaderContext &rctx, MappedFile *mf); template -int elf_main(int argc, char **argv); - -int main(int argc, char **argv); +int mold_main(int argc, char **argv); template std::ostream &operator<<(std::ostream &out, const InputFile &file); @@ -1938,8 +2167,11 @@ template class Symbol { public: Symbol() = default; - Symbol(std::string_view name) : nameptr(name.data()), namelen(name.size()) {} - Symbol(const Symbol &other) : Symbol(other.name()) {} + + Symbol(std::string_view name, bool demangle) + : nameptr(name.data()), namelen(name.size()), demangle(demangle) {} + + Symbol(const Symbol &other) : Symbol(other.name(), other.demangle) {} u64 get_addr(Context &ctx, i64 flags = 0) const; u64 get_got_addr(Context &ctx) const; @@ -2009,7 +2241,7 @@ class Symbol { // A symbol is owned by a file. If two or more files define the // same symbol, the one with the strongest definition owns the symbol. - // If `file` is null, the symbol is equivalent to nonexistent. + // If `file` is null, the symbol is not defined by any input file. InputFile *file = nullptr; // A symbol usually belongs to an input section, but it can belong @@ -2024,6 +2256,14 @@ class Symbol { TAG_MASK = 0b11, }; + // We want to make sure there are enough number of unused bits in + // pointers referring to these structures. In particular, we need + // __attribute__((aligned(4))) for m68k on which int, long, float + // and double are aligned only to two byte boundaries. + static_assert(alignof(InputSection) >= 4); + static_assert(alignof(Chunk) >= 4); + static_assert(alignof(SectionFragment) >= 4); + uintptr_t origin = 0; // `value` contains symbol value. If it's an absolute symbol, it is @@ -2156,63 +2396,44 @@ class Symbol { bool has_copyrel : 1 = false; bool is_copyrel_readonly : 1 = false; + // For symbol resolution. This flag is used rarely. See a comment in + // resolve_symbols(). 
+ bool skip_dso : 1 = false; + + // For --gc-sections + bool gc_root : 1 = false; + // For LTO. True if the symbol is referenced by a regular object (as // opposed to IR object). bool referenced_by_regular_obj : 1 = false; + // For `-z rewrite-endbr` + bool address_taken : 1 = false; + + // If true, we try to dmenagle the sybmol when printing. + bool demangle : 1 = false; + // Target-dependent extra members. [[no_unique_address]] SymbolExtras extra; }; -// If we haven't seen the same `key` before, create a new instance -// of Symbol and returns it. Otherwise, returns the previously- -// instantiated object. `key` is usually the same as `name`. template Symbol *get_symbol(Context &ctx, std::string_view key, - std::string_view name) { - typename decltype(ctx.symbol_map)::const_accessor acc; - ctx.symbol_map.insert(acc, {key, Symbol(name)}); - return const_cast *>(&acc->second); -} + std::string_view name); template -Symbol *get_symbol(Context &ctx, std::string_view name) { - return get_symbol(ctx, name, name); -} +Symbol *get_symbol(Context &ctx, std::string_view name); template -std::ostream &operator<<(std::ostream &out, const Symbol &sym) { - if (opt_demangle) - out << demangle(sym.name()); - else - out << sym.name(); - return out; -} +std::string_view demangle(const Symbol &sym); + +template +std::ostream &operator<<(std::ostream &out, const Symbol &sym); // // Inline objects and functions // -template -inline i64 FdeRecord::size(ObjectFile &file) const { - return *(U32 *)(file.cies[cie_idx].contents.data() + input_offset) + 4; -} - -template -inline std::string_view FdeRecord::get_contents(ObjectFile &file) const { - return file.cies[cie_idx].contents.substr(input_offset, size(file)); -} - -template -inline std::span> -FdeRecord::get_rels(ObjectFile &file) const { - std::span> rels = file.cies[cie_idx].rels; - i64 end = rel_idx; - while (end < rels.size() && rels[end].r_offset < input_offset + size(file)) - end++; - return rels.subspan(rel_idx, end - rel_idx); 
-} - template inline std::ostream & operator<<(std::ostream &out, const InputSection &isec) { @@ -2220,11 +2441,6 @@ operator<<(std::ostream &out, const InputSection &isec) { return out; } -template -inline u64 SectionFragment::get_addr(Context &ctx) const { - return output_section.shdr.sh_addr + offset; -} - template inline void InputSection::kill() { if (is_alive.exchange(false)) @@ -2240,7 +2456,7 @@ inline u64 InputSection::get_addr() const { template inline std::string_view InputSection::name() const { if (file.elf_sections.size() <= shndx) - return ".common"; + return (shdr().sh_flags & SHF_TLS) ? ".tls_common" : ".common"; return file.shstrtab.data() + file.elf_sections[shndx].sh_name; } @@ -2265,7 +2481,7 @@ i64 get_addend(InputSection &isec, const ElfRel &rel) { template void write_addend(u8 *loc, i64 val, const ElfRel &rel); -template requires E::is_rela +template requires E::is_rela && (!is_sh4) void write_addend(u8 *loc, i64 val, const ElfRel &rel) {} template @@ -2296,18 +2512,25 @@ InputSection::get_fragment(Context &ctx, const ElfRel &rel) { assert(!(shdr().sh_flags & SHF_ALLOC)); const ElfSym &esym = file.elf_syms[rel.r_sym]; + if (esym.is_abs() || esym.is_common() || esym.is_undef()) + return {nullptr, 0}; + + i64 shndx = file.get_shndx(esym); + std::unique_ptr> &m = file.mergeable_sections[shndx]; + if (!m) + return {nullptr, 0}; + if (esym.st_type == STT_SECTION) - if (std::unique_ptr> &m = - file.mergeable_sections[file.get_shndx(esym)]) - return m->get_fragment(esym.st_value + get_addend(*this, rel)); + return m->get_fragment(esym.st_value + get_addend(*this, rel)); - return {nullptr, 0}; + std::pair *, i64> p = m->get_fragment(esym.st_value); + return {p.first, p.second + get_addend(*this, rel)}; } template u64 InputSection::get_thunk_addr(i64 idx) { if constexpr (needs_thunk) { - RangeExtensionRef ref = extra.range_extn[idx]; + ThunkRef ref = extra.thunk_refs[idx]; assert(ref.thunk_idx != -1); return 
output_section->thunks[ref.thunk_idx]->get_addr(ref.sym_idx); } @@ -2336,45 +2559,44 @@ InputSection::get_tombstone(Symbol &sym, SectionFragment *frag) { if (!isec || isec->is_alive) return {}; - std::string_view s = name(); - if (!s.starts_with(".debug")) + std::string_view str = name(); + if (!str.starts_with(".debug")) return {}; // If the section was dead due to ICF, we don't want to emit debug // info for that section but want to set real values to .debug_line so // that users can set a breakpoint inside a merged section. - if (isec->is_killed_by_icf() && s == ".debug_line") + if (isec->icf_removed() && str == ".debug_line") return {}; // 0 is an invalid value in most debug info sections, so we use it // as a tombstone value. .debug_loc and .debug_ranges reserve 0 as - // the terminator marker, so we use 1 if that's the case. - return (s == ".debug_loc" || s == ".debug_ranges") ? 1 : 0; -} - -template -inline bool -InputSection::is_relr_reloc(Context &ctx, const ElfRel &rel) const { - return ctx.arg.pack_dyn_relocs_relr && - !(shdr().sh_flags & SHF_EXECINSTR) && - (shdr().sh_addralign % sizeof(Word)) == 0 && - (rel.r_offset % sizeof(Word)) == 0; + // the terminator marker, so we use 1 if that'str the case. + return (str == ".debug_loc" || str == ".debug_ranges") ? 
1 : 0; } template -inline bool InputSection::is_killed_by_icf() const { +inline bool InputSection::icf_removed() const { return this->leader && this->leader != this; } template std::pair *, i64> MergeableSection::get_fragment(i64 offset) { - std::vector &vec = frag_offsets; + std::span vec = frag_offsets; auto it = std::upper_bound(vec.begin(), vec.end(), offset); i64 idx = it - 1 - vec.begin(); return {fragments[idx], offset - vec[idx]}; } +template +std::string_view MergeableSection::get_contents(i64 i) { + i64 cur = frag_offsets[i]; + if (i == frag_offsets.size() - 1) + return section->contents.substr(cur); + return section->contents.substr(cur, frag_offsets[i + 1] - cur); +} + template template inline std::span @@ -2405,8 +2627,6 @@ InputFile::get_string(Context &ctx, const ElfShdr &shdr) { template inline std::string_view InputFile::get_string(Context &ctx, i64 idx) { - assert(idx < elf_sections.size()); - if (elf_sections.size() <= idx) Fatal(ctx) << *this << ": invalid section index: " << idx; return this->get_string(ctx, elf_sections[idx]); @@ -2424,6 +2644,8 @@ inline i64 ObjectFile::get_shndx(const ElfSym &esym) { if (esym.st_shndx == SHN_XINDEX) return symtab_shndx_sec[&esym - &this->elf_syms[0]]; + if (esym.st_shndx >= SHN_LORESERVE) + return 0; return esym.st_shndx; } @@ -2432,24 +2654,6 @@ inline InputSection *ObjectFile::get_section(const ElfSym &esym) { return sections[get_shndx(esym)].get(); } -template -OutputSection *find_section(Context &ctx, u32 sh_type) { - for (Chunk *chunk : ctx.chunks) - if (OutputSection *osec = chunk->to_osec()) - if (osec->shdr.sh_type == sh_type) - return osec; - return nullptr; -} - -template -OutputSection *find_section(Context &ctx, std::string_view name) { - for (Chunk *chunk : ctx.chunks) - if (OutputSection *osec = chunk->to_osec()) - if (osec->name == name) - return osec; - return nullptr; -} - template u64 Symbol::get_addr(Context &ctx, i64 flags) const { if (SectionFragment *frag = get_frag()) { @@ -2484,7 
+2688,7 @@ u64 Symbol::get_addr(Context &ctx, i64 flags) const { return value; // absolute symbol if (!isec->is_alive) { - if (isec->is_killed_by_icf()) + if (isec->icf_removed()) return isec->leader->get_addr() + value; if (isec->name() == ".eh_frame") { @@ -2718,9 +2922,6 @@ inline bool Symbol::has_plt(Context &ctx) const { template inline bool Symbol::is_absolute() const { - if (file && file->is_dso) - return esym().is_abs(); - return !is_imported && !get_frag() && !get_input_section() && !get_output_section(); } @@ -2820,8 +3021,11 @@ inline u32 Symbol::get_type() const { template inline std::string_view Symbol::get_version() const { - if (file->is_dso) - return ((SharedFile *)file)->version_strings[ver_idx]; + if (file->is_dso) { + std::span vers = ((SharedFile *)file)->version_strings; + if (!vers.empty()) + return vers[ver_idx]; + } return ""; } @@ -2878,4 +3082,13 @@ inline bool is_c_identifier(std::string_view s) { return true; } -} // namespace mold::elf +template +std::string_view save_string(Context &ctx, const std::string &str) { + u8 *buf = new u8[str.size() + 1]; + memcpy(buf, str.data(), str.size()); + buf[str.size()] = '\0'; + ctx.string_pool.push_back(std::unique_ptr(buf)); + return {(char *)buf, str.size()}; +} + +} // namespace mold diff --git a/elf/output-chunks.cc b/src/output-chunks.cc similarity index 75% rename from elf/output-chunks.cc rename to src/output-chunks.cc index f0553983..149859ab 100644 --- a/elf/output-chunks.cc +++ b/src/output-chunks.cc @@ -1,5 +1,5 @@ #include "mold.h" -#include "blake3.h" +#include "config.h" #include #include @@ -9,11 +9,7 @@ #include #include -#ifndef _WIN32 -# include -#endif - -namespace mold::elf { +namespace mold { // The hash function for .hash. static u32 elf_hash(std::string_view name) { @@ -28,16 +24,33 @@ static u32 elf_hash(std::string_view name) { return h; } -// The hash function for .gnu.hash. 
-static u32 djb_hash(std::string_view name) { - u32 h = 5381; - for (u8 c : name) - h = (h << 5) + h + c; - return h; +template +Chunk *find_chunk(Context &ctx, u32 sh_type) { + for (Chunk *chunk : ctx.chunks) + if (chunk->shdr.sh_type == sh_type) + return chunk; + return nullptr; +} + +template +Chunk *find_chunk(Context &ctx, std::string_view name) { + for (Chunk *chunk : ctx.chunks) + if (chunk->name == name) + return chunk; + return nullptr; } template -u64 get_eflags(Context &ctx) { +static u64 get_entry_addr(Context &ctx) { + if (ctx.arg.relocatable) + return 0; + + if (InputFile *file = ctx.arg.entry->file) + if (!file->is_dso) + return ctx.arg.entry->get_addr(ctx); + + if (!ctx.arg.shared) + Warn(ctx) << "entry symbol is not defined: " << *ctx.arg.entry; return 0; } @@ -46,27 +59,13 @@ void OutputEhdr::copy_buf(Context &ctx) { ElfEhdr &hdr = *(ElfEhdr *)(ctx.buf + this->shdr.sh_offset); memset(&hdr, 0, sizeof(hdr)); - auto get_entry_addr = [&]() -> u64 { - if (ctx.arg.relocatable) - return 0; - - if (!ctx.arg.entry.empty()) - if (Symbol *sym = get_symbol(ctx, ctx.arg.entry); - sym->file && !sym->file->is_dso) - return sym->get_addr(ctx); - - if (OutputSection *osec = find_section(ctx, ".text")) - return osec->shdr.sh_addr; - return 0; - }; - memcpy(&hdr.e_ident, "\177ELF", 4); hdr.e_ident[EI_CLASS] = E::is_64 ? ELFCLASS64 : ELFCLASS32; hdr.e_ident[EI_DATA] = E::is_le ? 
ELFDATA2LSB : ELFDATA2MSB; hdr.e_ident[EI_VERSION] = EV_CURRENT; hdr.e_machine = E::e_machine; hdr.e_version = EV_CURRENT; - hdr.e_entry = get_entry_addr(); + hdr.e_entry = get_entry_addr(ctx); hdr.e_flags = get_eflags(ctx); hdr.e_ehsize = sizeof(ElfEhdr); @@ -109,12 +108,12 @@ void OutputShdr::copy_buf(Context &ctx) { ElfShdr *hdr = (ElfShdr *)(ctx.buf + this->shdr.sh_offset); memset(hdr, 0, this->shdr.sh_size); + if (ctx.shstrtab && SHN_LORESERVE <= ctx.shstrtab->shndx) + hdr[0].sh_link = ctx.shstrtab->shndx; + i64 shnum = ctx.shdr->shdr.sh_size / sizeof(ElfShdr); if (UINT16_MAX < shnum) - hdr->sh_size = shnum; - - if (ctx.shstrtab && SHN_LORESERVE <= ctx.shstrtab->shndx) - hdr->sh_link = ctx.shstrtab->shndx; + hdr[0].sh_size = shnum; for (Chunk *chunk : ctx.chunks) if (chunk->shndx) @@ -149,15 +148,22 @@ template static std::vector> create_phdr(Context &ctx) { std::vector> vec; - auto define = [&](u64 type, u64 flags, i64 min_align, Chunk *chunk) { + auto define = [&](u64 type, u64 flags, Chunk *chunk) { ElfPhdr phdr = {}; phdr.p_type = type; phdr.p_flags = flags; - phdr.p_align = std::max(min_align, chunk->shdr.sh_addralign); - phdr.p_offset = chunk->shdr.sh_offset; - - if (chunk->shdr.sh_type != SHT_NOBITS) + phdr.p_align = chunk->shdr.sh_addralign; + + if (chunk->shdr.sh_type == SHT_NOBITS) { + // p_offset indicates the in-file start offset and is not + // significant for segments with zero on-file size. We still want to + // keep it congruent with the virtual address modulo page size + // because some loaders (at least FreeBSD's) are picky about it. 
+ phdr.p_offset = chunk->shdr.sh_addr % ctx.page_size; + } else { + phdr.p_offset = chunk->shdr.sh_offset; phdr.p_filesz = chunk->shdr.sh_size; + } phdr.p_vaddr = chunk->shdr.sh_addr; phdr.p_paddr = chunk->shdr.sh_addr; @@ -176,8 +182,7 @@ static std::vector> create_phdr(Context &ctx) { }; auto is_bss = [](Chunk *chunk) { - return chunk->shdr.sh_type == SHT_NOBITS && - !(chunk->shdr.sh_flags & SHF_TLS); + return chunk->shdr.sh_type == SHT_NOBITS; }; auto is_tbss = [](Chunk *chunk) { @@ -186,86 +191,90 @@ static std::vector> create_phdr(Context &ctx) { }; auto is_note = [](Chunk *chunk) { - ElfShdr &shdr = chunk->shdr; - return (shdr.sh_type == SHT_NOTE) && (shdr.sh_flags & SHF_ALLOC); + return chunk->shdr.sh_type == SHT_NOTE; }; + // When we are creating PT_LOAD segments, we consider only + // the following chunks. + std::vector *> chunks; + for (Chunk *chunk : ctx.chunks) + if ((chunk->shdr.sh_flags & SHF_ALLOC) && !is_tbss(chunk)) + chunks.push_back(chunk); + + // The ELF spec says that "loadable segment entries in the program + // header table appear in ascending order, sorted on the p_vaddr + // member". + sort(chunks, [](Chunk *a, Chunk *b) { + return a->shdr.sh_addr < b->shdr.sh_addr; + }); + // Create a PT_PHDR for the program header itself. if (ctx.phdr && (ctx.phdr->shdr.sh_flags & SHF_ALLOC)) - define(PT_PHDR, PF_R, sizeof(Word), ctx.phdr); + define(PT_PHDR, PF_R, ctx.phdr); // Create a PT_INTERP. if (ctx.interp) - define(PT_INTERP, PF_R, 1, ctx.interp); + define(PT_INTERP, PF_R, ctx.interp); // Create a PT_NOTE for SHF_NOTE sections. 
- for (i64 i = 0, end = ctx.chunks.size(); i < end;) { - Chunk *first = ctx.chunks[i++]; - if (!is_note(first)) - continue; - - i64 flags = to_phdr_flags(ctx, first); - i64 alignment = first->shdr.sh_addralign; - define(PT_NOTE, flags, alignment, first); + for (i64 i = 0; i < chunks.size();) { + Chunk *first = chunks[i++]; + if (is_note(first)) { + i64 flags = to_phdr_flags(ctx, first); + define(PT_NOTE, flags, first); - while (i < end && is_note(ctx.chunks[i]) && - to_phdr_flags(ctx, ctx.chunks[i]) == flags) - append(ctx.chunks[i++]); + while (i < chunks.size() && + is_note(ctx.chunks[i]) && + to_phdr_flags(ctx, ctx.chunks[i]) == flags) + append(ctx.chunks[i++]); + } } // Create PT_LOAD segments. - { - i64 idx = vec.size(); - std::vector *> chunks = ctx.chunks; - std::erase_if(chunks, is_tbss); - - for (i64 i = 0, end = chunks.size(); i < end;) { - Chunk *first = chunks[i++]; - if (!(first->shdr.sh_flags & SHF_ALLOC)) - continue; - - i64 flags = to_phdr_flags(ctx, first); - define(PT_LOAD, flags, ctx.page_size, first); - - // Add contiguous ALLOC sections as long as they have the same - // section flags and there's no on-disk gap in between. - if (!is_bss(first)) - while (i < end && !is_bss(chunks[i]) && - to_phdr_flags(ctx, chunks[i]) == flags && - chunks[i]->shdr.sh_offset - first->shdr.sh_offset == - chunks[i]->shdr.sh_addr - first->shdr.sh_addr) - append(chunks[i++]); - - while (i < end && is_bss(chunks[i]) && - to_phdr_flags(ctx, chunks[i]) == flags) + for (i64 i = 0; i < chunks.size();) { + Chunk *first = chunks[i++]; + i64 flags = to_phdr_flags(ctx, first); + define(PT_LOAD, flags, first); + vec.back().p_align = std::max(ctx.page_size, vec.back().p_align); + + // Add contiguous ALLOC sections as long as they have the same + // section flags and there's no on-disk gap in between. 
+ if (!is_bss(first)) + while (i < chunks.size() && + !is_bss(chunks[i]) && + to_phdr_flags(ctx, chunks[i]) == flags && + chunks[i]->shdr.sh_offset - first->shdr.sh_offset == + chunks[i]->shdr.sh_addr - first->shdr.sh_addr) append(chunks[i++]); - } - // The ELF spec says that "loadable segment entries in the program - // header table appear in ascending order, sorted on the p_vaddr - // member". - std::stable_sort(vec.begin() + idx, vec.end(), - [](const ElfPhdr &a, const ElfPhdr &b) { - return a.p_vaddr < b.p_vaddr; - }); + while (i < chunks.size() && + is_bss(chunks[i]) && + to_phdr_flags(ctx, chunks[i]) == flags) + append(chunks[i++]); } // Create a PT_TLS. - for (i64 i = 0; i < ctx.chunks.size(); i++) { - if (ctx.chunks[i]->shdr.sh_flags & SHF_TLS) { - define(PT_TLS, PF_R, 1, ctx.chunks[i++]); - while (i < ctx.chunks.size() && (ctx.chunks[i]->shdr.sh_flags & SHF_TLS)) + for (i64 i = 0; i < ctx.chunks.size();) { + Chunk *first = ctx.chunks[i++]; + if (first->shdr.sh_flags & SHF_TLS) { + define(PT_TLS, PF_R, first); + while (i < ctx.chunks.size() && + (ctx.chunks[i]->shdr.sh_flags & SHF_TLS)) append(ctx.chunks[i++]); } } // Add PT_DYNAMIC if (ctx.dynamic && ctx.dynamic->shdr.sh_size) - define(PT_DYNAMIC, PF_R | PF_W, 1, ctx.dynamic); + define(PT_DYNAMIC, to_phdr_flags(ctx, ctx.dynamic), ctx.dynamic); // Add PT_GNU_EH_FRAME if (ctx.eh_frame_hdr) - define(PT_GNU_EH_FRAME, PF_R, 1, ctx.eh_frame_hdr); + define(PT_GNU_EH_FRAME, PF_R, ctx.eh_frame_hdr); + + // Add PT_GNU_PROPERTY + if (Chunk *chunk = find_chunk(ctx, ".note.gnu.property")) + define(PT_GNU_PROPERTY, PF_R, chunk); // Add PT_GNU_STACK, which is a marker segment that doesn't really // contain any segments. It controls executable bit of stack area. @@ -280,31 +289,31 @@ static std::vector> create_phdr(Context &ctx) { // Create a PT_GNU_RELRO. 
if (ctx.arg.z_relro) { - for (i64 i = 0; i < ctx.chunks.size(); i++) { - if (!ctx.chunks[i]->is_relro) - continue; - - define(PT_GNU_RELRO, PF_R, 1, ctx.chunks[i++]); - while (i < ctx.chunks.size() && ctx.chunks[i]->is_relro) - append(ctx.chunks[i++]); - vec.back().p_align = 1; + for (i64 i = 0; i < chunks.size();) { + Chunk *first = chunks[i++]; + if (first->is_relro) { + define(PT_GNU_RELRO, PF_R, first); + while (i < chunks.size() && chunks[i]->is_relro) + append(chunks[i++]); + vec.back().p_align = 1; + } } } // Create a PT_ARM_EDXIDX if constexpr (is_arm32) - if (OutputSection *osec = find_section(ctx, SHT_ARM_EXIDX)) - define(PT_ARM_EXIDX, PF_R, 4, osec); + if (ctx.extra.exidx) + define(PT_ARM_EXIDX, PF_R, ctx.extra.exidx); // Create a PT_RISCV_ATTRIBUTES if constexpr (is_riscv) if (ctx.extra.riscv_attributes->shdr.sh_size) - define(PT_RISCV_ATTRIBUTES, PF_R, 1, ctx.extra.riscv_attributes); + define(PT_RISCV_ATTRIBUTES, PF_R, ctx.extra.riscv_attributes); // Create a PT_OPENBSD_RANDOMIZE for (Chunk *chunk : ctx.chunks) if (chunk->name == ".openbsd.randomdata") - define(PT_OPENBSD_RANDOMIZE, PF_R | PF_W, 1, chunk); + define(PT_OPENBSD_RANDOMIZE, PF_R | PF_W, chunk); // Set p_paddr if --physical-image-base was given. 
--physical-image-base // is typically used in embedded programming to specify the base address @@ -352,6 +361,7 @@ static std::vector> create_phdr(Context &ctx) { } } + vec.resize(vec.size() + ctx.arg.spare_program_headers); return vec; } @@ -360,9 +370,14 @@ void OutputPhdr::update_shdr(Context &ctx) { phdrs = create_phdr(ctx); this->shdr.sh_size = phdrs.size() * sizeof(ElfPhdr); - ctx.tls_begin = get_tls_begin(ctx); - ctx.tp_addr = get_tp_addr(ctx); - ctx.dtp_addr = get_dtp_addr(ctx); + for (ElfPhdr &phdr : phdrs) { + if (phdr.p_type == PT_TLS) { + ctx.tls_begin = phdr.p_vaddr; + ctx.tp_addr = get_tp_addr(phdr); + ctx.dtp_addr = get_dtp_addr(phdr); + break; + } + } } template @@ -389,11 +404,6 @@ void RelDynSection::update_shdr(Context &ctx) { offset += chunk->get_reldyn_size(ctx) * sizeof(ElfRel); } - for (ObjectFile *file : ctx.objs) { - file->reldyn_offset = offset; - offset += file->num_dynrel * sizeof(ElfRel); - } - this->shdr.sh_size = offset; this->shdr.sh_link = ctx.dynsym->shndx; } @@ -403,7 +413,7 @@ void RelDynSection::sort(Context &ctx) { Timer t(ctx, "sort_dynamic_relocs"); ElfRel *begin = (ElfRel *)(ctx.buf + this->shdr.sh_offset); - ElfRel *end = (ElfRel *)((u8 *)begin + this->shdr.sh_size); + ElfRel *end = begin + this->shdr.sh_size / sizeof(ElfRel); auto get_rank = [](u32 r_type) { if (r_type == E::R_RELATIVE) @@ -467,7 +477,7 @@ void StrtabSection::update_shdr(Context &ctx) { // affect correctness of the program but helps disassembler to // disassemble machine code appropriately. 
if constexpr (is_arm32) - if (!ctx.arg.strip_all && !ctx.arg.retain_symbols_file) + if (!ctx.arg.strip_all) offset += sizeof("$a\0$t\0$d"); for (Chunk *chunk : ctx.chunks) { @@ -494,7 +504,7 @@ void StrtabSection::copy_buf(Context &ctx) { buf[0] = '\0'; if constexpr (is_arm32) - if (!ctx.arg.strip_all && !ctx.arg.retain_symbols_file) + if (!ctx.arg.strip_all) memcpy(buf + 1, "$a\0$t\0$d", 9); } @@ -504,7 +514,7 @@ void ShstrtabSection::update_shdr(Context &ctx) { i64 offset = 1; for (Chunk *chunk : ctx.chunks) { - if (chunk->kind() != ChunkKind::HEADER && !chunk->name.empty()) { + if (!chunk->is_header() && !chunk->name.empty()) { auto [it, inserted] = map.insert({chunk->name, offset}); chunk->shdr.sh_name = it->second; if (inserted) @@ -521,7 +531,7 @@ void ShstrtabSection::copy_buf(Context &ctx) { base[0] = '\0'; for (Chunk *chunk : ctx.chunks) - if (chunk->kind() != ChunkKind::HEADER && !chunk->name.empty()) + if (chunk->shdr.sh_name) write_string(base + chunk->shdr.sh_name, chunk->name); } @@ -554,17 +564,13 @@ void DynstrSection::copy_buf(Context &ctx) { u8 *base = ctx.buf + this->shdr.sh_offset; base[0] = '\0'; - for (std::pair pair : strings) - write_string(base + pair.second, pair.first); + for (std::pair p : strings) + write_string(base + p.second, p.first); - if (!ctx.dynsym->symbols.empty()) { - i64 offset = dynsym_offset; - - for (i64 i = 1; i < ctx.dynsym->symbols.size(); i++) { - Symbol &sym = *ctx.dynsym->symbols[i]; - offset += write_string(base + offset, sym.name()); - } - } + i64 off = dynsym_offset; + for (Symbol *sym : ctx.dynsym->symbols) + if (sym) + off += write_string(base + off, sym->name()); } template @@ -732,19 +738,19 @@ static std::vector> create_dynamic_section(Context &ctx) { define(DT_STRSZ, ctx.dynstr->shdr.sh_size); } - if (find_section(ctx, SHT_INIT_ARRAY)) { + if (find_chunk(ctx, SHT_INIT_ARRAY)) { define(DT_INIT_ARRAY, ctx.__init_array_start->value); define(DT_INIT_ARRAYSZ, ctx.__init_array_end->value - 
ctx.__init_array_start->value); } - if (find_section(ctx, SHT_PREINIT_ARRAY)) { + if (find_chunk(ctx, SHT_PREINIT_ARRAY)) { define(DT_PREINIT_ARRAY, ctx.__preinit_array_start->value); define(DT_PREINIT_ARRAYSZ, ctx.__preinit_array_end->value - ctx.__preinit_array_start->value); } - if (find_section(ctx, SHT_FINI_ARRAY)) { + if (find_chunk(ctx, SHT_FINI_ARRAY)) { define(DT_FINI_ARRAY, ctx.__fini_array_start->value); define(DT_FINI_ARRAYSZ, ctx.__fini_array_end->value - ctx.__fini_array_start->value); @@ -763,13 +769,13 @@ static std::vector> create_dynamic_section(Context &ctx) { define(DT_VERDEFNUM, ctx.verdef->shdr.sh_info); } - if (Symbol *sym = get_symbol(ctx, ctx.arg.init); - sym->file && !sym->file->is_dso) - define(DT_INIT, sym->get_addr(ctx)); + if (Symbol &sym = *ctx.arg.init; + sym.file && !sym.file->is_dso) + define(DT_INIT, sym.get_addr(ctx)); - if (Symbol *sym = get_symbol(ctx, ctx.arg.fini); - sym->file && !sym->file->is_dso) - define(DT_FINI, sym->get_addr(ctx)); + if (Symbol &sym = *ctx.arg.fini; + sym.file && !sym.file->is_dso) + define(DT_FINI, sym.get_addr(ctx)); if (ctx.hash) define(DT_HASH, ctx.hash->shdr.sh_addr); @@ -833,7 +839,7 @@ static std::vector> create_dynamic_section(Context &ctx) { // GDB needs a DT_DEBUG entry in an executable to store a word-size // data for its own purpose. Its content is not important. 
- if (!ctx.arg.shared) + if (!ctx.arg.shared && !ctx.arg.z_rodynamic) define(DT_DEBUG, 0); define(DT_NULL, 0); @@ -846,7 +852,7 @@ static std::vector> create_dynamic_section(Context &ctx) { template void DynamicSection::update_shdr(Context &ctx) { - if (ctx.arg.is_static && !ctx.arg.pie) + if (ctx.arg.static_ && !ctx.arg.pie) return; this->shdr.sh_size = create_dynamic_section(ctx).size() * sizeof(Word); @@ -856,50 +862,104 @@ void DynamicSection::update_shdr(Context &ctx) { template void DynamicSection::copy_buf(Context &ctx) { std::vector> contents = create_dynamic_section(ctx); - assert(this->shdr.sh_size == contents.size() * sizeof(contents[0])); + assert(this->shdr.sh_size == contents.size() * sizeof(Word)); write_vector(ctx.buf + this->shdr.sh_offset, contents); } +template +static std::vector> split(std::vector &input, i64 unit) { + std::span span(input); + std::vector> vec; + + while (span.size() >= unit) { + vec.push_back(span.subspan(0, unit)); + span = span.subspan(unit); + } + if (!span.empty()) + vec.push_back(span); + return vec; +} + + +// Assign offsets to OutputSection members template -OutputSection::OutputSection(Context &ctx, std::string_view name, - u32 type, u64 flags) { - this->name = name; - this->shdr.sh_type = type; - this->shdr.sh_flags = flags & ~SHF_MERGE & ~SHF_STRINGS; +void OutputSection::compute_section_size(Context &ctx) { + ElfShdr &shdr = this->shdr; - if (auto it = ctx.arg.section_align.find(name); - it != ctx.arg.section_align.end()) - this->shdr.sh_addralign = it->second; + // On most RISC systems, we need to create so-called "range extension + // thunks" to extend branch instructions reach, as their jump + // instructions' reach is limited. create_range_extension_thunks() + // computes the size of the section while inserting thunks. 
+ if constexpr (needs_thunk) { + if ((shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) { + create_range_extension_thunks(ctx); + return; + } + } - // PT_GNU_RELRO segment is a security mechanism to make more pages - // read-only than we could have done without it. - // - // Traditionally, sections are either read-only or read-write. If a - // section contains dynamic relocations, it must have been put into a - // read-write segment so that the program loader can mutate its - // contents in memory, even if no one will write to it at runtime. - // - // RELRO segment allows us to make such pages writable only when a - // program is being loaded. After that, the page becomes read-only. - // - // Some sections, such as .init, .fini, .got, .dynamic, contain - // dynamic relocations but doesn't have to be writable at runtime, - // so they are put into a RELRO segment. - this->is_relro = (name == ".toc" || name.ends_with(".rel.ro") || - type == SHT_INIT_ARRAY || type == SHT_FINI_ARRAY || - type == SHT_PREINIT_ARRAY || (flags & SHF_TLS)); + // Since one output section may contain millions of input sections, + // we first split input sections into groups and assign offsets to + // groups. 
+ struct Group { + std::span *> members; + i64 size = 0; + i64 p2align = 0; + i64 offset = 0; + }; + + std::span *> mem = members; + std::vector groups; + constexpr i64 group_size = 10000; + + while (!mem.empty()) { + i64 sz = std::min(group_size, mem.size()); + groups.push_back({mem.subspan(0, sz)}); + mem = mem.subspan(sz); + } + + tbb::parallel_for_each(groups, [](Group &group) { + for (InputSection *isec : group.members) { + group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size; + group.p2align = std::max(group.p2align, isec->p2align); + } + }); + + shdr.sh_size = 0; + + for (i64 i = 0; i < groups.size(); i++) { + shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align); + groups[i].offset = shdr.sh_size; + shdr.sh_size += groups[i].size; + shdr.sh_addralign = std::max(shdr.sh_addralign, 1 << groups[i].p2align); + } + + // Assign offsets to input sections. + tbb::parallel_for_each(groups, [](Group &group) { + i64 offset = group.offset; + for (InputSection *isec : group.members) { + offset = align_to(offset, 1 << isec->p2align); + isec->offset = offset; + offset += isec->sh_size; + } + }); } template void OutputSection::copy_buf(Context &ctx) { - if (this->shdr.sh_type != SHT_NOBITS) - write_to(ctx, ctx.buf + this->shdr.sh_offset); + if (this->shdr.sh_type != SHT_NOBITS) { + ElfRel *rel = nullptr; + if (ctx.reldyn) + rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + + this->reldyn_offset); + + write_to(ctx, ctx.buf + this->shdr.sh_offset, rel); + } } template -void OutputSection::write_to(Context &ctx, u8 *buf) { +void OutputSection::write_to(Context &ctx, u8 *buf, ElfRel *rel) { + // Copy section contents to an output file. tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { - // Copy section contents to an output file. InputSection &isec = *members[i]; isec.write_to(ctx, buf + isec.offset); @@ -924,12 +984,46 @@ void OutputSection::write_to(Context &ctx, u8 *buf) { } }); + // Emit range extension thunks. 
if constexpr (needs_thunk) { - tbb::parallel_for_each(thunks, - [&](std::unique_ptr> &thunk) { + tbb::parallel_for_each(thunks, [&](std::unique_ptr> &thunk) { thunk->copy_buf(ctx); }); } + + // Emit dynamic relocations. + for (AbsRel &r : abs_rels) { + Word *loc = (Word *)(buf + r.isec->offset + r.offset); + u64 addr = this->shdr.sh_addr + r.isec->offset + r.offset; + Symbol &sym = *r.sym; + + switch (r.kind) { + case ABS_REL_NONE: + case ABS_REL_RELR: + *loc = sym.get_addr(ctx) + r.addend; + break; + case ABS_REL_BASEREL: { + u64 val = sym.get_addr(ctx) + r.addend; + *rel++ = ElfRel(addr, E::R_RELATIVE, 0, val); + if (ctx.arg.apply_dynamic_relocs) + *loc = val; + break; + } + case ABS_REL_IFUNC: + if constexpr (supports_ifunc) { + u64 val = sym.get_addr(ctx, NO_PLT) + r.addend; + *rel++ = ElfRel(addr, E::R_IRELATIVE, 0, val); + if (ctx.arg.apply_dynamic_relocs) + *loc = val; + } + break; + case ABS_REL_DYNREL: + *rel++ = ElfRel(addr, E::R_ABS, sym.get_dynsym_idx(ctx), r.addend); + if (ctx.arg.apply_dynamic_relocs) + *loc = r.addend; + break; + } + } } // .relr.dyn contains base relocations encoded in a space-efficient form. @@ -951,14 +1045,16 @@ void OutputSection::write_to(Context &ctx, u8 *buf) { // the .rel.dyn section). A bitmap has LSB 1. template static std::vector encode_relr(std::span pos) { + for (i64 i = 0; i < pos.size(); i++) { + assert(pos[i] % sizeof(Word) == 0); + assert(i == 0 || pos[i - 1] < pos[i]); + } + std::vector vec; - i64 num_bits = sizeof(Word) * 8 - 1; + i64 num_bits = E::is_64 ? 
63 : 31; i64 max_delta = sizeof(Word) * num_bits; for (i64 i = 0; i < pos.size();) { - assert(i == 0 || pos[i - 1] <= pos[i]); - assert(pos[i] % sizeof(Word) == 0); - vec.push_back(pos[i]); u64 base = pos[i] + sizeof(Word); i++; @@ -966,7 +1062,7 @@ static std::vector encode_relr(std::span pos) { for (;;) { u64 bits = 0; for (; i < pos.size() && pos[i] - base < max_delta; i++) - bits |= 1LL << ((pos[i] - base) / sizeof(Word)); + bits |= (u64)1 << ((pos[i] - base) / sizeof(Word)); if (!bits) break; @@ -979,36 +1075,92 @@ static std::vector encode_relr(std::span pos) { } template -void OutputSection::construct_relr(Context &ctx) { - if (!ctx.arg.pic) - return; - if (!(this->shdr.sh_flags & SHF_ALLOC)) - return; - if (this->shdr.sh_addralign % sizeof(Word)) - return; +static AbsRelKind get_abs_rel_kind(Context &ctx, Symbol &sym) { + if (sym.is_ifunc()) + return sym.is_pde_ifunc(ctx) ? ABS_REL_NONE : ABS_REL_IFUNC; - // Skip it if it is a text section because .text doesn't usually - // contain any dynamic relocations. - if (this->shdr.sh_flags & SHF_EXECINSTR) - return; + if (sym.is_absolute()) + return ABS_REL_NONE; - // Collect base relocations - std::vector> shards(members.size()); + // True if the symbol's address is in the output file. + if (!sym.is_imported || (sym.flags & NEEDS_CPLT) || (sym.flags & NEEDS_COPYREL)) + return ctx.arg.pic ? ABS_REL_BASEREL : ABS_REL_NONE; - tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { - InputSection &isec = *members[i]; - if ((1 << isec.p2align) < sizeof(Word)) - return; + return ABS_REL_DYNREL; +} - for (const ElfRel &r : isec.get_rels(ctx)) - if (r.r_type == E::R_ABS && (r.r_offset % sizeof(Word)) == 0) - if (Symbol &sym = *isec.file.symbols[r.r_sym]; - !sym.is_absolute() && !sym.is_imported) - shards[i].push_back(isec.offset + r.r_offset); +// Scan word-size absolute relocations (e.g. R_X86_64_64). 
This is +// separated from scan_relocations() because only such relocations can +// be promoted to dynamic relocations. +template +void OutputSection::scan_abs_relocations(Context &ctx) { + std::vector>> shards(members.size()); + + // Collect all word-size absolute relocations + tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { + InputSection *isec = members[i]; + for (const ElfRel &r : isec->get_rels(ctx)) + if (r.r_type == E::R_ABS) + shards[i].push_back(AbsRel{isec, r.r_offset, isec->file.symbols[r.r_sym], + get_addend(*isec, r)}); }); - // Compress them - std::vector pos = flatten(shards); + abs_rels = flatten(shards); + + // We can sometimes avoid creating dynamic relocations in read-only + // sections by promoting symbols to canonical PLT or copy relocations. + if (!ctx.arg.pic && !(this->shdr.sh_flags & SHF_WRITE)) + for (AbsRel &r : abs_rels) + if (Symbol &sym = *r.sym; + sym.is_imported && !sym.is_absolute()) + sym.flags |= (sym.get_type() == STT_FUNC) ? NEEDS_CPLT : NEEDS_COPYREL; + + // Now we can compute whether they need to be promoted to dynamic + // relocations or not. + for (AbsRel &r : abs_rels) + r.kind = get_abs_rel_kind(ctx, *r.sym); + + // If we have a relocation against a read-only section, we need to + // set the DT_TEXTREL flag for the loader. + for (AbsRel &r : abs_rels) { + if (r.kind != ABS_REL_NONE && !(r.isec->shdr().sh_flags & SHF_WRITE)) { + if (ctx.arg.z_text) { + Error(ctx) << *r.isec << ": relocation at offset 0x" + << std::hex << r.offset << " against symbol `" + << *r.sym << "' can not be used; recompile with -fPIC"; + } else if (ctx.arg.warn_textrel) { + Warn(ctx) << *r.isec << ": relocation against symbol `" << *r.sym + << "' in read-only section"; + } + ctx.has_textrel = true; + } + } + + // If --pack-dyn-relocs=relr is enabled, base relocations are put into + // .relr.dyn. 
+ if (ctx.arg.pack_dyn_relocs_relr) + for (AbsRel &r : abs_rels) + if (r.kind == ABS_REL_BASEREL && + r.isec->shdr().sh_addralign % sizeof(Word) == 0 && + r.offset % sizeof(Word) == 0) + r.kind = ABS_REL_RELR; +} + +template +i64 OutputSection::get_reldyn_size(Context &ctx) const { + i64 n = 0; + for (const AbsRel &r : abs_rels) + if (r.kind != ABS_REL_NONE && r.kind != ABS_REL_RELR) + n++; + return n; +} + +template +void OutputSection::construct_relr(Context &ctx) { + std::vector pos; + for (const AbsRel &r : abs_rels) + if (r.kind == ABS_REL_RELR) + pos.push_back(r.isec->offset + r.offset); this->relr = encode_relr(pos); } @@ -1019,7 +1171,7 @@ void OutputSection::compute_symtab_size(Context &ctx) { this->strtab_size = 0; this->num_local_symtab = 0; - for (std::unique_ptr> &thunk : thunks) { + for (std::unique_ptr> &thunk : thunks) { // For ARM32, we emit additional symbol "$t", "$a" and "$d" for // each thunk to mark the beginning of Thumb code, ARM code and // data, respectively. @@ -1039,9 +1191,6 @@ void OutputSection::compute_symtab_size(Context &ctx) { // disassembling and/or debugging our output. template void OutputSection::populate_symtab(Context &ctx) { - if (this->num_local_symtab == 0) - return; - if constexpr (needs_thunk) { ElfSym *esym = (ElfSym *)(ctx.buf + ctx.symtab->shdr.sh_offset) + this->local_symtab_idx; @@ -1058,7 +1207,7 @@ void OutputSection::populate_symtab(Context &ctx) { esym++; }; - for (std::unique_ptr> &thunk : thunks) { + for (std::unique_ptr> &thunk : thunks) { for (i64 i = 0; i < thunk->symbols.size(); i++) { Symbol &sym = *thunk->symbols[i]; u64 addr = thunk->get_addr(i); @@ -1116,7 +1265,7 @@ void GotSection::add_tlsdesc_symbol(Context &ctx, Symbol *sym) { // statically-linked executable), we always relax TLSDESC relocations // so that no TLSDESC relocation exist at runtime. 
assert(supports_tlsdesc); - assert(!ctx.arg.is_static); + assert(!ctx.arg.static_); sym->set_tlsdesc_idx(ctx, this->shdr.sh_size / sizeof(Word)); this->shdr.sh_size += sizeof(Word) * 2; @@ -1287,11 +1436,11 @@ void GotSection::copy_buf(Context &ctx) { buf[0] = ctx.dynamic->shdr.sh_addr; // arm64 psABI doesn't say anything about GOT[0], but glibc/arm64's code - // path for -static-pie wrongly assumed that GOT[0] refers _DYNAMIC. + // path for -static-pie wrongly assumed that GOT[0] refers to _DYNAMIC. // // https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=43d06ed218fc8be5 if constexpr (is_arm64) - if (ctx.dynamic && ctx.arg.is_static && ctx.arg.pie) + if (ctx.dynamic && ctx.arg.static_ && ctx.arg.pie) buf[0] = ctx.dynamic->shdr.sh_addr; ElfRel *rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + @@ -1300,30 +1449,31 @@ void GotSection::copy_buf(Context &ctx) { for (GotEntry &ent : get_got_entries(ctx)) { if (ent.is_relr(ctx) || ent.r_type == R_NONE) { buf[ent.idx] = ent.val; - } else { - *rel++ = ElfRel(this->shdr.sh_addr + ent.idx * sizeof(Word), - ent.r_type, - ent.sym ? ent.sym->get_dynsym_idx(ctx) : 0, - ent.val); - - bool is_tlsdesc = false; - if constexpr (supports_tlsdesc) - is_tlsdesc = (ent.r_type == E::R_TLSDESC); - - if (ctx.arg.apply_dynamic_relocs) { - if (is_tlsdesc && !is_arm32) { - // A single TLSDESC relocation fixes two consecutive GOT slots - // where one slot holds a function pointer and the other an - // argument to the function. An addend should be applied not to - // the function pointer but to the function argument, which is - // usually stored to the second slot. - // - // ARM32 employs the inverted layout for some reason, so an - // addend is applied to the first slot. - buf[ent.idx + 1] = ent.val; - } else { - buf[ent.idx] = ent.val; - } + continue; + } + + *rel++ = ElfRel(this->shdr.sh_addr + ent.idx * sizeof(Word), + ent.r_type, + ent.sym ? 
ent.sym->get_dynsym_idx(ctx) : 0, + ent.val); + + bool is_tlsdesc = false; + if constexpr (supports_tlsdesc) + is_tlsdesc = (ent.r_type == E::R_TLSDESC); + + if (ctx.arg.apply_dynamic_relocs) { + if (is_tlsdesc && !is_arm32) { + // A single TLSDESC relocation fixes two consecutive GOT slots + // where one slot holds a function pointer and the other an + // argument to the function. An addend should be applied not to + // the function pointer but to the function argument, which is + // usually stored to the second slot. + // + // ARM32 employs the inverted layout for some reason, so an + // addend is applied to the first slot. + buf[ent.idx + 1] = ent.val; + } else { + buf[ent.idx] = ent.val; } } } @@ -1331,13 +1481,10 @@ void GotSection::copy_buf(Context &ctx) { template void GotSection::construct_relr(Context &ctx) { - assert(ctx.arg.pack_dyn_relocs_relr); - std::vector pos; for (GotEntry &ent : get_got_entries(ctx)) if (ent.is_relr(ctx)) pos.push_back(ent.idx * sizeof(Word)); - this->relr = encode_relr(pos); } @@ -1440,7 +1587,6 @@ void GotPltSection::copy_buf(Context &ctx) { template void PltSection::add_symbol(Context &ctx, Symbol *sym) { assert(!sym->has_plt(ctx)); - sym->set_plt_idx(ctx, symbols.size()); symbols.push_back(sym); ctx.dynsym->add_symbol(ctx, sym); @@ -1625,9 +1771,6 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, if constexpr (is_ppc64v2) esym.ppc_local_entry = sym.esym().ppc_local_entry; - if constexpr (is_alpha) - esym.alpha_st_other = sym.esym().alpha_st_other; - auto get_st_shndx = [&](Symbol &sym) -> u32 { if (SectionFragment *frag = sym.get_frag()) if (frag->is_alive) @@ -1640,7 +1783,7 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, if (InputSection *isec = sym.get_input_section()) { if (isec->is_alive) return isec->output_section->shndx; - else if (isec->is_killed_by_icf()) + if (isec->icf_removed()) return isec->leader->output_section->shndx; } @@ -1648,11 +1791,16 @@ ElfSym to_output_esym(Context &ctx, 
Symbol &sym, u32 st_name, }; i64 shndx = -1; + InputSection *isec = sym.get_input_section(); + if (sym.has_copyrel) { + // Symbol in .copyrel shndx = sym.is_copyrel_readonly ? ctx.copyrel_relro->shndx : ctx.copyrel->shndx; esym.st_value = sym.get_addr(ctx); } else if (sym.file->is_dso || sym.esym().is_undef()) { + // Undefined symbol in a DSO esym.st_shndx = SHN_UNDEF; + esym.st_size = 0; if (sym.is_canonical) esym.st_value = sym.get_plt_addr(ctx); } else if (Chunk *osec = sym.get_output_section()) { @@ -1663,7 +1811,7 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, // Section fragment shndx = frag->output_section.shndx; esym.st_value = sym.get_addr(ctx); - } else if (!sym.get_input_section()) { + } else if (!isec) { // Absolute symbol esym.st_shndx = SHN_ABS; esym.st_value = sym.get_addr(ctx); @@ -1676,8 +1824,23 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, shndx = get_st_shndx(sym); esym.st_type = STT_FUNC; esym.st_visibility = sym.visibility; - esym.st_value = sym.get_addr(ctx); + esym.st_value = sym.get_plt_addr(ctx); + } else if ((isec->shdr().sh_flags & SHF_MERGE) && + !(isec->shdr().sh_flags & SHF_ALLOC)) { + // Symbol in a mergeable non-SHF_ALLOC section, such as .debug_str + ObjectFile *file = (ObjectFile *)sym.file; + MergeableSection &m = + *file->mergeable_sections[file->get_shndx(sym.esym())]; + + SectionFragment *frag; + i64 frag_addend; + std::tie(frag, frag_addend) = m.get_fragment(sym.esym().st_value); + + shndx = m.parent.shndx; + esym.st_visibility = sym.visibility; + esym.st_value = frag->get_addr(ctx) + frag_addend; } else { + // Symbol in a regular section shndx = get_st_shndx(sym); esym.st_visibility = sym.visibility; esym.st_value = sym.get_addr(ctx, NO_PLT); @@ -1701,8 +1864,6 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, template void DynsymSection::add_symbol(Context &ctx, Symbol *sym) { - assert(!finalized); - if (symbols.empty()) symbols.resize(1); @@ -1712,67 +1873,6 @@ void 
DynsymSection::add_symbol(Context &ctx, Symbol *sym) { } } -template -void DynsymSection::finalize(Context &ctx) { - Timer t(ctx, "DynsymSection::finalize"); - assert(!finalized); - finalized = true; - - if (symbols.empty()) - return; - - // Sort symbols. In any symtab, local symbols must precede global symbols. - auto first_global = std::stable_partition(symbols.begin() + 1, symbols.end(), - [&](Symbol *sym) { - return sym->is_local(ctx); - }); - - // We also place undefined symbols before defined symbols for .gnu.hash. - // Defined symbols are sorted by their hashes for .gnu.hash. - if (ctx.gnu_hash) { - // Count the number of exported symbols to compute the size of .gnu.hash. - i64 num_exported = 0; - for (i64 i = 1; i < symbols.size(); i++) - if (symbols[i]->is_exported) - num_exported++; - - u32 num_buckets = num_exported / ctx.gnu_hash->LOAD_FACTOR + 1; - ctx.gnu_hash->num_buckets = num_buckets; - - tbb::parallel_for((i64)(first_global - symbols.begin()), (i64)symbols.size(), - [&](i64 i) { - Symbol &sym = *symbols[i]; - sym.set_dynsym_idx(ctx, i); - sym.set_djb_hash(ctx, djb_hash(sym.name())); - }); - - tbb::parallel_sort(first_global, symbols.end(), - [&](Symbol *a, Symbol *b) { - if (a->is_exported != b->is_exported) - return b->is_exported; - - u32 h1 = a->get_djb_hash(ctx) % num_buckets; - u32 h2 = b->get_djb_hash(ctx) % num_buckets; - return std::tuple(h1, a->get_dynsym_idx(ctx)) < - std::tuple(h2, b->get_dynsym_idx(ctx)); - }); - } - - // Compute .dynstr size - ctx.dynstr->dynsym_offset = ctx.dynstr->shdr.sh_size; - - tbb::enumerable_thread_specific size; - tbb::parallel_for((i64)1, (i64)symbols.size(), [&](i64 i) { - symbols[i]->set_dynsym_idx(ctx, i); - size.local() += symbols[i]->name().size() + 1; - }); - - ctx.dynstr->shdr.sh_size += size.combine(std::plus()); - - // ELF's symbol table sh_info holds the offset of the first global symbol. 
- this->shdr.sh_info = first_global - symbols.begin(); -} - template void DynsymSection::update_shdr(Context &ctx) { this->shdr.sh_link = ctx.dynstr->shndx; @@ -1831,34 +1931,20 @@ void HashSection::copy_buf(Context &ctx) { } } -template -std::span *> -GnuHashSection::get_exported_symbols(Context &ctx) { - std::span *> syms = ctx.dynsym->symbols; - auto it = std::partition_point(syms.begin() + 1, syms.end(), [](Symbol *sym) { - return !sym->is_exported; - }); - return syms.subspan(it - syms.begin()); -} - template void GnuHashSection::update_shdr(Context &ctx) { if (ctx.dynsym->symbols.empty()) return; - this->shdr.sh_link = ctx.dynsym->shndx; - - i64 num_exported = get_exported_symbols(ctx).size(); - if (num_exported) { - // We allocate 12 bits for each symbol in the bloom filter. - i64 num_bits = num_exported * 12; - num_bloom = bit_ceil(num_bits / (sizeof(Word) * 8)); - } + // We allocate 12 bits for each symbol in the bloom filter. + num_bloom = bit_ceil((num_exported * 12) / (sizeof(Word) * 8)); this->shdr.sh_size = HEADER_SIZE; // Header this->shdr.sh_size += num_bloom * sizeof(Word); // Bloom filter this->shdr.sh_size += num_buckets * 4; // Hash buckets this->shdr.sh_size += num_exported * 4; // Hash values + + this->shdr.sh_link = ctx.dynsym->shndx; } template @@ -1866,12 +1952,15 @@ void GnuHashSection::copy_buf(Context &ctx) { u8 *base = ctx.buf + this->shdr.sh_offset; memset(base, 0, this->shdr.sh_size); - std::span *> syms = get_exported_symbols(ctx); - std::vector indices(syms.size()); - i64 exported_offset = ctx.dynsym->symbols.size() - syms.size(); + i64 first_exported = ctx.dynsym->symbols.size() - num_exported; + + std::span *> syms = ctx.dynsym->symbols; + syms = syms.subspan(first_exported); + + std::vector indices(num_exported); *(U32 *)base = num_buckets; - *(U32 *)(base + 4) = exported_offset; + *(U32 *)(base + 4) = first_exported; *(U32 *)(base + 8) = num_bloom; *(U32 *)(base + 12) = BLOOM_SHIFT; @@ -1881,7 +1970,7 @@ void 
GnuHashSection::copy_buf(Context &ctx) { for (i64 i = 0; i < syms.size(); i++) { constexpr i64 word_bits = sizeof(Word) * 8; - i64 h = syms[i]->get_djb_hash(ctx); + u32 h = syms[i]->get_djb_hash(ctx); indices[i] = h % num_buckets; i64 idx = (h / word_bits) % num_bloom; @@ -1894,7 +1983,7 @@ void GnuHashSection::copy_buf(Context &ctx) { for (i64 i = 0; i < syms.size(); i++) if (!buckets[indices[i]]) - buckets[indices[i]] = i + exported_offset; + buckets[indices[i]] = i + first_exported; // Write a hash table U32 *table = buckets + num_buckets; @@ -1922,14 +2011,11 @@ get_merged_output_name(Context &ctx, std::string_view name, u64 flags, // GCC seems to create sections named ".rodata.strN..M". // We want to eliminate the symbol name part from the section name. if ((flags & SHF_STRINGS) && name.starts_with(".rodata.")) { - if (entsize == 1 && addralign == 1) - return ".rodata.str1.1"; - if (entsize == 2 && addralign == 2) - return ".rodata.str2.2"; - if (entsize == 4 && addralign == 4) - return ".rodata.str4.4"; - return save_string(ctx,".rodata.str"s + std::to_string(entsize) + "." + - std::to_string(addralign)); + std::string name2 = ".rodata.str"s + std::to_string(entsize) + + "." + std::to_string(addralign); + if (name == name2) + return name; + return save_string(ctx, name2); } return name; @@ -1947,16 +2033,26 @@ MergedSection::MergedSection(std::string_view name, i64 flags, i64 type, template MergedSection * MergedSection::get_instance(Context &ctx, std::string_view name, - i64 type, i64 flags, - i64 entsize, i64 addralign) { + const ElfShdr &shdr) { + if (!(shdr.sh_flags & SHF_MERGE)) + return nullptr; + + i64 addralign = std::max(1, shdr.sh_addralign); + i64 flags = shdr.sh_flags & ~(u64)SHF_GROUP & ~(u64)SHF_COMPRESSED; + + i64 entsize = shdr.sh_entsize; + if (entsize == 0) + entsize = (shdr.sh_flags & SHF_STRINGS) ? 
1 : (i64)shdr.sh_addralign; + if (entsize == 0) + return nullptr; + name = get_merged_output_name(ctx, name, flags, entsize, addralign); - flags = flags & ~(u64)SHF_GROUP & ~(u64)SHF_COMPRESSED; auto find = [&]() -> MergedSection * { for (std::unique_ptr> &osec : ctx.merged_sections) - if (std::tuple(name, flags, type, entsize) == - std::tuple(osec->name, osec->shdr.sh_flags, osec->shdr.sh_type, - osec->shdr.sh_entsize)) + if (name == osec->name && flags == osec->shdr.sh_flags && + shdr.sh_type == osec->shdr.sh_type && + entsize == osec->shdr.sh_entsize) return osec.get(); return nullptr; }; @@ -1974,7 +2070,7 @@ MergedSection::get_instance(Context &ctx, std::string_view name, if (MergedSection *osec = find()) return osec; - MergedSection *osec = new MergedSection(name, flags, type, entsize); + MergedSection *osec = new MergedSection(name, flags, shdr.sh_type, entsize); ctx.merged_sections.emplace_back(osec); return osec; } @@ -1983,26 +2079,67 @@ template SectionFragment * MergedSection::insert(Context &ctx, std::string_view data, u64 hash, i64 p2align) { - std::call_once(once_flag, [&] { - // We aim 2/3 occupation ratio - map.resize(estimator.get_cardinality() * 3 / 2); - }); - // Even if GC is enabled, we garbage-collect only memory-mapped strings. // Non-memory-allocated strings are typically identifiers used by debug info. // To remove such strings, use the `strip` command. 
bool is_alive = !ctx.arg.gc_sections || !(this->shdr.sh_flags & SHF_ALLOC); - SectionFragment *frag; - bool inserted; - std::tie(frag, inserted) = - map.insert(data, hash, SectionFragment(this, is_alive)); + SectionFragment *frag = + map.insert(data, hash, SectionFragment(this, is_alive)).first; update_maximum(frag->p2align, p2align); return frag; } template -void MergedSection::assign_offsets(Context &ctx) { +static std::string get_cmdline_args(Context &ctx) { + std::stringstream ss; + ss << ctx.cmdline_args[1]; + for (i64 i = 2; i < ctx.cmdline_args.size(); i++) + ss << " " << ctx.cmdline_args[i]; + return ss.str(); +} + +// Add strings to .comment +template +static void add_comment_strings(Context &ctx) { + auto add = [&](std::string str) { + std::string_view buf = save_string(ctx, str); + std::string_view data(buf.data(), buf.size() + 1); + ctx.comment->insert(ctx, data, hash_string(data), 0); + }; + + // Add an identification string to .comment. + add(get_mold_version()); + + // Embed command line arguments for debugging. 
+ char *env = getenv("MOLD_DEBUG"); + if (env && env[0]) + add("mold command line: " + get_cmdline_args(ctx)); +} + +template +void MergedSection::resolve(Context &ctx) { + tbb::parallel_for_each(members, [&](MergeableSection *sec) { + sec->split_contents(ctx); + }); + + // We aim 2/3 occupation ratio + map.resize(estimator.get_cardinality() * 3 / 2); + + tbb::parallel_for_each(members, [&](MergeableSection *sec) { + sec->resolve_contents(ctx); + }); + + if (this == ctx.comment) + add_comment_strings(ctx); + resolved = true; +} + +template +void MergedSection::compute_section_size(Context &ctx) { + if (!resolved) + resolve(ctx); + std::vector sizes(map.NUM_SHARDS); Atomic alignment = 1; @@ -2047,26 +2184,31 @@ void MergedSection::assign_offsets(Context &ctx) { this->shdr.sh_size = shard_offsets[map.NUM_SHARDS]; this->shdr.sh_addralign = alignment; + + if (this->shdr.sh_size > UINT32_MAX) + Fatal(ctx) << this->name << ": output section too large"; } template void MergedSection::copy_buf(Context &ctx) { - write_to(ctx, ctx.buf + this->shdr.sh_offset); + write_to(ctx, ctx.buf + this->shdr.sh_offset, nullptr); } template -void MergedSection::write_to(Context &ctx, u8 *buf) { +void MergedSection::write_to(Context &ctx, u8 *buf, ElfRel *rel) { i64 shard_size = map.nbuckets / map.NUM_SHARDS; tbb::parallel_for((i64)0, map.NUM_SHARDS, [&](i64 i) { - memset(buf + shard_offsets[i], 0, shard_offsets[i + 1] - shard_offsets[i]); + // There might be gaps between strings to satisfy alignment requirements. + // If that's the case, we need to zero-clear them. 
+ if (this->shdr.sh_addralign > 1) + memset(buf + shard_offsets[i], 0, shard_offsets[i + 1] - shard_offsets[i]); + // Copy strings for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) - if (const char *key = map.entries[j].key) { - SectionFragment &frag = map.entries[j].value; - if (frag.is_alive) + if (const char *key = map.entries[j].key) + if (SectionFragment &frag = map.entries[j].value; frag.is_alive) memcpy(buf + frag.offset, key, map.entries[j].keylen); - } }); } @@ -2077,23 +2219,15 @@ void MergedSection::print_stats(Context &ctx) { if (map.entries[i].key) used++; - SyncOut(ctx) << this->name - << " estimation=" << estimator.get_cardinality() - << " actual=" << used; + Out(ctx) << this->name + << " estimation=" << estimator.get_cardinality() + << " actual=" << used; } template void EhFrameSection::construct(Context &ctx) { Timer t(ctx, "eh_frame"); - // If .eh_frame is missing in all input files, we don't want to - // create an output .eh_frame section. - if (std::all_of(ctx.objs.begin(), ctx.objs.end(), - [](ObjectFile *file) { return file->cies.empty(); })) { - this->shdr.sh_size = 0; - return; - } - // Remove dead FDEs and assign them offsets within their corresponding // CIE group. 
tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { @@ -2111,7 +2245,7 @@ void EhFrameSection::construct(Context &ctx) { std::vector *> leaders; auto find_leader = [&](CieRecord &cie) -> CieRecord * { for (CieRecord *leader : leaders) - if (cie.equals(*leader)) + if (cie_equals(*leader, cie)) return leader; return nullptr; }; @@ -2154,9 +2288,9 @@ void EhFrameSection::copy_buf(Context &ctx) { I32 fde_addr; }; - HdrEntry *eh_hdr_begin = nullptr; + HdrEntry *eh_hdr = nullptr; if (ctx.eh_frame_hdr) - eh_hdr_begin = (HdrEntry *)(ctx.buf + ctx.eh_frame_hdr->shdr.sh_offset + + eh_hdr = (HdrEntry *)(ctx.buf + ctx.eh_frame_hdr->shdr.sh_offset + EhFrameHdrSection::HEADER_SIZE); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { @@ -2184,6 +2318,7 @@ void EhFrameSection::copy_buf(Context &ctx) { // Copy FDEs. for (i64 i = 0; i < file->fdes.size(); i++) { FdeRecord &fde = file->fdes[i]; + std::span> rels = fde.get_rels(*file); i64 offset = file->fde_offset + fde.output_offset; std::string_view contents = fde.get_contents(*file); @@ -2195,23 +2330,24 @@ void EhFrameSection::copy_buf(Context &ctx) { if (ctx.arg.relocatable) continue; - bool is_first = true; - for (const ElfRel &rel : fde.get_rels(*file)) { + for (const ElfRel &rel : rels) { assert(rel.r_offset - fde.input_offset < contents.size()); Symbol &sym = *file->symbols[rel.r_sym]; u64 loc = offset + rel.r_offset - fde.input_offset; u64 val = sym.get_addr(ctx) + get_addend(cie.input_section, rel); apply_eh_reloc(ctx, rel, loc, val); + } - if (eh_hdr_begin && is_first) { - // Write to .eh_frame_hdr - HdrEntry &ent = eh_hdr_begin[file->fde_idx + i]; - u64 sh_addr = ctx.eh_frame_hdr->shdr.sh_addr; - ent.init_addr = val - sh_addr; - ent.fde_addr = this->shdr.sh_addr + offset - sh_addr; - is_first = false; - } + if (eh_hdr) { + // Write to .eh_frame_hdr + Symbol &sym = *file->symbols[rels[0].r_sym]; + u64 val = sym.get_addr(ctx) + get_addend(cie.input_section, rels[0]); + u64 sh_addr = 
ctx.eh_frame_hdr->shdr.sh_addr; + + HdrEntry &ent = eh_hdr[file->fde_idx + i]; + ent.init_addr = val - sh_addr; + ent.fde_addr = this->shdr.sh_addr + offset - sh_addr; } } }); @@ -2220,8 +2356,8 @@ void EhFrameSection::copy_buf(Context &ctx) { *(U32 *)(base + this->shdr.sh_size - 4) = 0; // Sort .eh_frame_hdr contents. - if (eh_hdr_begin) { - tbb::parallel_sort(eh_hdr_begin, eh_hdr_begin + ctx.eh_frame_hdr->num_fdes, + if (eh_hdr) { + tbb::parallel_sort(eh_hdr, eh_hdr + ctx.eh_frame_hdr->num_fdes, [](const HdrEntry &a, const HdrEntry &b) { return a.init_addr < b.init_addr; }); @@ -2324,6 +2460,16 @@ void CopyrelSection::add_symbol(Context &ctx, Symbol *sym) { assert(!ctx.arg.shared); assert(sym->file->is_dso); + if (sym->esym().st_visibility == STV_PROTECTED) + Error(ctx) << *sym->file + << ": cannot create a copy relocation for protected symbol '" + << *sym << "'; recompile with -fPIC"; + + if (!ctx.arg.z_copyreloc) + Error(ctx) << "-z nocopyreloc: " << *sym->file + << ": cannot create a copy relocation for symbol '" << *sym + << "'; recompile with -fPIC"; + symbols.push_back(sym); SharedFile &file = *(SharedFile *)sym->file; @@ -2339,7 +2485,7 @@ void CopyrelSection::add_symbol(Context &ctx, Symbol *sym) { // For example, `environ`, `_environ` and `__environ` in libc.so are // aliases. If one of the symbols is copied by a copy relocation, other // symbols have to refer to the copied place as well. - for (Symbol *sym2 : file.find_aliases(sym)) { + for (Symbol *sym2 : file.get_symbols_at(sym)) { sym2->add_aux(ctx); sym2->is_imported = true; sym2->is_exported = true; @@ -2350,29 +2496,19 @@ void CopyrelSection::add_symbol(Context &ctx, Symbol *sym) { } } -template -void CopyrelSection::update_shdr(Context &ctx) { - // SHT_NOBITS sections (i.e. BSS sections) have to be at the end of - // a segment, so a .copyrel.rel.ro usually requires one extra - // segment for it. 
We turn a .copyrel.rel.ro into a regular section - // if it is very small to avoid the cost of the extra segment. - if (this->is_relro && ctx.arg.z_relro && this->shdr.sh_size < E::page_size) - this->shdr.sh_type = SHT_PROGBITS; -} - template void CopyrelSection::copy_buf(Context &ctx) { - if (this->shdr.sh_type == SHT_PROGBITS) - memset(ctx.buf + this->shdr.sh_offset, 0, this->shdr.sh_size); - ElfRel *rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + this->reldyn_offset); for (Symbol *sym : symbols) - *rel++ = ElfRel(sym->get_addr(ctx), E::R_COPY, sym->get_dynsym_idx(ctx), - 0); + *rel++ = ElfRel(sym->get_addr(ctx), E::R_COPY, + sym->get_dynsym_idx(ctx), 0); } +// .gnu.version section contains version indices as a parallel array for +// .dynsym. If a dynamic symbol is a defined one, its version information +// is in .gnu.version_d. Otherwise, it's in .gnu.version_r. template void VersymSection::update_shdr(Context &ctx) { this->shdr.sh_size = contents.size() * sizeof(contents[0]); @@ -2384,6 +2520,30 @@ void VersymSection::copy_buf(Context &ctx) { write_vector(ctx.buf + this->shdr.sh_offset, contents); } +// If `-z pack-relative-relocs` is specified, we'll create a .relr.dyn +// section and store base relocation records to that section instead of +// to the usual .rela.dyn section. +// +// .relr.dyn is relatively new feature and not supported by glibc until +// 2.38 which was released in 2022. If we don't do anything, executables +// built with `-z pack-relative-relocs` would just crash immediately on +// startup with an older version of glibc. +// +// As a workaround, we'll add a dependency to a dummy version name +// "GLIBC_ABI_DT_RELR" if `-z pack-relative-relocs` is given so that +// executables built with the option failed with a more friendly "version +// `GLIBC_ABI_DT_RELR' not found" error message. glibc 2.38 or later knows +// about this dummy version name and simply ignores it. 
+template +static InputFile *find_glibc2(Context &ctx) { + for (Symbol *sym : ctx.dynsym->symbols) + if (sym && sym->file->is_dso && + ((SharedFile *)sym->file)->soname.starts_with("libc.so.") && + sym->get_version().starts_with("GLIBC_2.")) + return sym->file; + return nullptr; +} + template void VerneedSection::construct(Context &ctx) { Timer t(ctx, "fill_verneed"); @@ -2406,8 +2566,8 @@ void VerneedSection::construct(Context &ctx) { }); // Resize .gnu.version - ctx.versym->contents.resize(ctx.dynsym->symbols.size(), 1); - ctx.versym->contents[0] = 0; + ctx.versym->contents.resize(ctx.dynsym->symbols.size(), VER_NDX_GLOBAL); + ctx.versym->contents[0] = VER_NDX_LOCAL; // Allocate a large enough buffer for .gnu.version_r. contents.resize((sizeof(ElfVerneed) + sizeof(ElfVernaux)) * @@ -2419,7 +2579,7 @@ void VerneedSection::construct(Context &ctx) { ElfVerneed *verneed = nullptr; ElfVernaux *aux = nullptr; - u16 veridx = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size(); + i64 veridx = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size(); auto start_group = [&](InputFile *file) { this->shdr.sh_info++; @@ -2440,7 +2600,7 @@ void VerneedSection::construct(Context &ctx) { if (aux) aux->vna_next = sizeof(ElfVernaux); aux = (ElfVernaux *)ptr; - ptr += sizeof(*aux); + ptr += sizeof(ElfVernaux); aux->vna_hash = elf_hash(verstr); aux->vna_other = ++veridx; @@ -2460,29 +2620,7 @@ void VerneedSection::construct(Context &ctx) { } if (ctx.arg.pack_dyn_relocs_relr) { - // If `-z pack-relative-relocs` is specified, we'll create a .relr.dyn - // section and store base relocation records to that section instead of - // to the usual .rela.dyn section. - // - // .relr.dyn is relatively new feature and not supported by glibc until - // 2.38 which was released in 2022. Executables built with `-z - // pack-relative-relocs` don't work and usually crash immediately on - // startup if libc doesn't support it. 
- // - // In the following code, we'll add a dependency to a dummy version name - // "GLIBC_ABI_DT_RELR" so that executables built with the option failed - // with a more friendly "version `GLIBC_ABI_DT_RELR' not found" error - // message. glibc 2.38 or later knows about this dummy version name and - // simply ignores it. - auto find_glibc2 = [&]() -> InputFile * { - for (Symbol *sym : syms) - if (((SharedFile *)sym->file)->soname.starts_with("libc.so.") && - sym->get_version().starts_with("GLIBC_2.")) - return sym->file; - return nullptr; - }; - - if (InputFile *file = find_glibc2()) { + if (InputFile *file = find_glibc2(ctx)) { start_group(file); add_entry("GLIBC_ABI_DT_RELR"); } @@ -2510,16 +2648,20 @@ void VerdefSection::construct(Context &ctx) { if (ctx.arg.version_definitions.empty()) return; - // Resize .gnu.version - ctx.versym->contents.resize(ctx.dynsym->symbols.size(), 1); - ctx.versym->contents[0] = 0; + // Resize .gnu.version and write to it + ctx.versym->contents.resize(ctx.dynsym->symbols.size(), VER_NDX_GLOBAL); + ctx.versym->contents[0] = VER_NDX_LOCAL; + + for (i64 i = 1; i < ctx.dynsym->symbols.size(); i++) + if (Symbol &sym = *ctx.dynsym->symbols[i]; + !sym.file->is_dso && sym.ver_idx != VER_NDX_UNSPECIFIED) + ctx.versym->contents[sym.get_dynsym_idx(ctx)] = sym.ver_idx; - // Allocate a buffer for .gnu.version_d. + // Allocate a buffer for .gnu.version_d and write to it contents.resize((sizeof(ElfVerdef) + sizeof(ElfVerdaux)) * (ctx.arg.version_definitions.size() + 1)); - u8 *buf = (u8 *)&contents[0]; - u8 *ptr = buf; + u8 *ptr = (u8 *)contents.data(); ElfVerdef *verdef = nullptr; auto write = [&](std::string_view verstr, i64 idx, i64 flags) { @@ -2542,20 +2684,14 @@ void VerdefSection::construct(Context &ctx) { aux->vda_name = ctx.dynstr->add_string(verstr); }; - std::string_view basename = ctx.arg.soname.empty() ? 
- ctx.arg.output : ctx.arg.soname; - write(basename, 1, VER_FLG_BASE); + if (!ctx.arg.soname.empty()) + write(ctx.arg.soname, 1, VER_FLG_BASE); + else + write(ctx.arg.output, 1, VER_FLG_BASE); - i64 idx = 2; + i64 idx = VER_NDX_LAST_RESERVED + 1; for (std::string_view verstr : ctx.arg.version_definitions) write(verstr, idx++, 0); - - for (Symbol *sym : std::span *>(ctx.dynsym->symbols).subspan(1)) { - i64 ver = sym->ver_idx; - if (ver == VER_NDX_UNSPECIFIED) - ver = VER_NDX_GLOBAL; - ctx.versym->contents[sym->get_dynsym_idx(ctx)] = ver; - } } template @@ -2569,101 +2705,21 @@ void VerdefSection::copy_buf(Context &ctx) { write_vector(ctx.buf + this->shdr.sh_offset, contents); } -inline i64 BuildId::size() const { - switch (kind) { - case HEX: - return value.size(); - case HASH: - return hash_size; - case UUID: - return 16; - default: - unreachable(); - } -} - template void BuildIdSection::update_shdr(Context &ctx) { - this->shdr.sh_size = HEADER_SIZE + ctx.arg.build_id.size(); + this->shdr.sh_size = ctx.arg.build_id.size() + 16; // +16 for the header } template void BuildIdSection::copy_buf(Context &ctx) { U32 *base = (U32 *)(ctx.buf + this->shdr.sh_offset); memset(base, 0, this->shdr.sh_size); - base[0] = 4; // Name size - base[1] = ctx.arg.build_id.size(); // Hash size - base[2] = NT_GNU_BUILD_ID; // Type - memcpy(base + 3, "GNU", 4); // Name string -} - -// BLAKE3 is a cryptographic hash function just like SHA256. -// We use it instead of SHA256 because it's faster. 
-static void blake3_hash(u8 *buf, i64 size, u8 *out) { - blake3_hasher hasher; - blake3_hasher_init(&hasher); - blake3_hasher_update(&hasher, buf, size); - blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN); -} - -template -static void compute_blake3(Context &ctx, i64 offset) { - u8 *buf = ctx.buf; - i64 filesize = ctx.output_file->filesize; - - i64 shard_size = 4096 * 1024; - i64 num_shards = align_to(filesize, shard_size) / shard_size; - std::vector shards(num_shards * BLAKE3_OUT_LEN); - - tbb::parallel_for((i64)0, num_shards, [&](i64 i) { - u8 *begin = buf + shard_size * i; - u8 *end = (i == num_shards - 1) ? buf + filesize : begin + shard_size; - blake3_hash(begin, end - begin, shards.data() + i * BLAKE3_OUT_LEN); -#ifndef _WIN32 - // We call munmap early for each chunk so that the last munmap - // gets cheaper. We assume that the .note.build-id section is - // at the beginning of an output file. This is an ugly performance - // hack, but we can save about 30 ms for a 2 GiB output. 
- if (i > 0 && ctx.output_file->is_mmapped) - munmap(begin, end - begin); -#endif - }); - - assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN); - - u8 digest[BLAKE3_OUT_LEN]; - blake3_hash(shards.data(), shards.size(), digest); - memcpy(buf + offset, digest, ctx.arg.build_id.size()); - -#ifndef _WIN32 - if (ctx.output_file->is_mmapped) { - munmap(buf, std::min(filesize, shard_size)); - ctx.output_file->is_unmapped = true; - } -#endif -} - -template -void BuildIdSection::write_buildid(Context &ctx) { - Timer t(ctx, "build_id"); - - switch (ctx.arg.build_id.kind) { - case BuildId::HEX: - write_vector(ctx.buf + this->shdr.sh_offset + HEADER_SIZE, - ctx.arg.build_id.value); - return; - case BuildId::HASH: - compute_blake3(ctx, this->shdr.sh_offset + HEADER_SIZE); - return; - case BuildId::UUID: { - std::array uuid = get_uuid_v4(); - memcpy(ctx.buf + this->shdr.sh_offset + HEADER_SIZE, uuid.data(), 16); - return; - } - default: - unreachable(); - } + base[0] = 4; // Name size + base[1] = ctx.arg.build_id.size(); // Hash size + base[2] = NT_GNU_BUILD_ID; // Type + memcpy(base + 3, "GNU", 4); // Name string + write_vector(base + 4, contents); // Build ID } template @@ -2747,6 +2803,8 @@ void NotePropertySection::update_shdr(Context &ctx) { if (ctx.arg.z_shstk) properties[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK; + properties[GNU_PROPERTY_X86_ISA_1_NEEDED] |= ctx.arg.z_x86_64_isa_level; + std::erase_if(properties, [](std::pair kv) { return kv.second == 0; }); @@ -2785,16 +2843,16 @@ CompressedSection::CompressedSection(Context &ctx, Chunk &chunk) { this->uncompressed_data.resize(chunk.shdr.sh_size); u8 *buf = this->uncompressed_data.data(); - chunk.write_to(ctx, buf); + chunk.write_to(ctx, buf, nullptr); switch (ctx.arg.compress_debug_sections) { case COMPRESS_ZLIB: chdr.ch_type = ELFCOMPRESS_ZLIB; - compressed.reset(new ZlibCompressor(buf, chunk.shdr.sh_size)); + compressor.reset(new ZlibCompressor(buf, chunk.shdr.sh_size)); break; case 
COMPRESS_ZSTD: chdr.ch_type = ELFCOMPRESS_ZSTD; - compressed.reset(new ZstdCompressor(buf, chunk.shdr.sh_size)); + compressor.reset(new ZstdCompressor(buf, chunk.shdr.sh_size)); break; default: unreachable(); @@ -2806,7 +2864,7 @@ CompressedSection::CompressedSection(Context &ctx, Chunk &chunk) { this->shdr = chunk.shdr; this->shdr.sh_flags |= SHF_COMPRESSED; this->shdr.sh_addralign = 1; - this->shdr.sh_size = sizeof(chdr) + compressed->compressed_size; + this->shdr.sh_size = sizeof(chdr) + compressor->compressed_size; this->shndx = chunk.shndx; // We don't need to keep the original data unless --gdb-index is given. @@ -2820,7 +2878,7 @@ template void CompressedSection::copy_buf(Context &ctx) { u8 *base = ctx.buf + this->shdr.sh_offset; memcpy(base, &chdr, sizeof(chdr)); - compressed->write_to(base + sizeof(chdr)); + compressor->write_to(base + sizeof(chdr)); } template @@ -2865,37 +2923,41 @@ void RelocSection::update_shdr(Context &ctx) { template void RelocSection::copy_buf(Context &ctx) { - auto write = [&](ElfRel &out, InputSection &isec, const ElfRel &rel) { + auto get_symidx_addend = [&](InputSection &isec, const ElfRel &rel) + -> std::pair { Symbol &sym = *isec.file.symbols[rel.r_sym]; - i64 symidx = 0; - i64 addend = 0; + + if (!(isec.shdr().sh_flags & SHF_ALLOC)) { + SectionFragment *frag; + i64 frag_addend; + std::tie(frag, frag_addend) = isec.get_fragment(ctx, rel); + if (frag) + return {frag->output_section.shndx, frag->offset + frag_addend}; + } if (sym.esym().st_type == STT_SECTION) { - if (SectionFragment *frag = sym.get_frag()) { - symidx = frag->output_section.shndx; - addend = frag->offset + sym.value + get_addend(isec, rel); - } else { - InputSection *target = sym.get_input_section(); - - if (OutputSection *osec = target->output_section) { - symidx = osec->shndx; - addend = get_addend(isec, rel) + target->offset; - } else if (isec.name() == ".eh_frame") { - symidx = ctx.eh_frame->shndx; - addend = get_addend(isec, rel); - } else { - // This is 
usually a dead debug section referring a - // COMDAT-eliminated section. - } - } - } else if (sym.write_to_symtab) { - symidx = sym.get_output_sym_idx(ctx); - addend = get_addend(isec, rel); + if (SectionFragment *frag = sym.get_frag()) + return {frag->output_section.shndx, + frag->offset + sym.value + get_addend(isec, rel)}; + + InputSection *isec2 = sym.get_input_section(); + if (OutputSection *osec = isec2->output_section) + return {osec->shndx, get_addend(isec, rel) + isec2->offset}; + + // This is usually a dead debug section referring to a + // COMDAT-eliminated section. + return {0, 0}; } - if constexpr (is_alpha) - if (rel.r_type == R_ALPHA_GPDISP || rel.r_type == R_ALPHA_LITUSE) - addend = rel.r_addend; + if (sym.write_to_symtab) + return {sym.get_output_sym_idx(ctx), get_addend(isec, rel)}; + return {0, 0}; + }; + + auto write = [&](ElfRel &out, InputSection &isec, const ElfRel &rel) { + i64 symidx; + i64 addend; + std::tie(symidx, addend) = get_symidx_addend(isec, rel); i64 r_offset = isec.output_section->shdr.sh_addr + isec.offset + rel.r_offset; out = ElfRel(r_offset, rel.r_type, symidx, addend); @@ -2935,6 +2997,20 @@ void ComdatGroupSection::copy_buf(Context &ctx) { *buf++ = chunk->shndx; } +template +void GnuDebuglinkSection::update_shdr(Context &ctx) { + filename = std::filesystem::path(ctx.arg.separate_debug_file).filename().string(); + this->shdr.sh_size = align_to(filename.size() + 1, 4) + 4; +} + +template +void GnuDebuglinkSection::copy_buf(Context &ctx) { + u8 *buf = ctx.buf + this->shdr.sh_offset; + memset(buf, 0, this->shdr.sh_size); + write_string(buf, filename); + *(U32 *)(buf + this->shdr.sh_size - 4) = crc32; +} + using E = MOLD_TARGET; template class Chunk; @@ -2973,7 +3049,11 @@ template class GdbIndexSection; template class CompressedSection; template class RelocSection; template class ComdatGroupSection; +template class GnuDebuglinkSection; + +template Chunk *find_chunk(Context &, u32); +template Chunk *find_chunk(Context &, 
std::string_view); template i64 to_phdr_flags(Context &ctx, Chunk *chunk); template ElfSym to_output_esym(Context &, Symbol &, u32, U32 *); -} // namespace mold::elf +} // namespace mold diff --git a/src/output-file-unix.cc b/src/output-file-unix.cc new file mode 100644 index 00000000..0a6f9eb2 --- /dev/null +++ b/src/output-file-unix.cc @@ -0,0 +1,200 @@ +#include "mold.h" + +#include +#include +#include +#include +#include +#include + +namespace mold { + +static u32 get_umask() { + u32 orig_umask = umask(0); + umask(orig_umask); + return orig_umask; +} + +template +static int +open_or_create_file(Context &ctx, std::string path, std::string tmpfile, + int perm) { + // Reuse an existing file if exists and writable because on Linux, + // writing to an existing file is much faster than creating a fresh + // file and writing to it. + if (ctx.overwrite_output_file && rename(path.c_str(), tmpfile.c_str()) == 0) { + i64 fd = ::open(tmpfile.c_str(), O_RDWR | O_CREAT, perm); + if (fd != -1) + return fd; + unlink(tmpfile.c_str()); + } + + i64 fd = ::open(tmpfile.c_str(), O_RDWR | O_CREAT, perm); + if (fd == -1) + Fatal(ctx) << "cannot open " << tmpfile << ": " << errno_string(); + return fd; +} + +template +class MemoryMappedOutputFile : public OutputFile { +public: + MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, int perm) + : OutputFile(path, filesize, true) { + std::filesystem::path dir = filepath(path).parent_path(); + std::string filename = filepath(path).filename().string(); + std::string tmpfile = dir / ("." + filename + "." 
+ std::to_string(getpid())); + + this->fd = open_or_create_file(ctx, path, tmpfile, perm); + + if (fchmod(this->fd, perm & ~get_umask()) == -1) + Fatal(ctx) << "fchmod failed: " << errno_string(); + + if (ftruncate(this->fd, filesize) == -1) + Fatal(ctx) << "ftruncate failed: " << errno_string(); + + output_tmpfile = (char *)save_string(ctx, tmpfile).data(); + +#ifdef __linux__ + fallocate(this->fd, 0, 0, filesize); +#endif + + this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE, + MAP_SHARED, this->fd, 0); + if (this->buf == MAP_FAILED) + Fatal(ctx) << path << ": mmap failed: " << errno_string(); + + mold::output_buffer_start = this->buf; + mold::output_buffer_end = this->buf + filesize; + } + + ~MemoryMappedOutputFile() { + if (fd2 != -1) + ::close(fd2); + } + + void close(Context &ctx) override { + Timer t(ctx, "close_file"); + + if (!this->is_unmapped) + munmap(this->buf, this->filesize); + + if (this->buf2.empty()) { + ::close(this->fd); + } else { + FILE *out = fdopen(this->fd, "w"); + fseek(out, 0, SEEK_END); + fwrite(&this->buf2[0], this->buf2.size(), 1, out); + fclose(out); + } + + // If an output file already exists, open a file and then remove it. + // This is the fastest way to unlink a file, as it does not make the + // system to immediately release disk blocks occupied by the file. 
+ fd2 = ::open(this->path.c_str(), O_RDONLY); + if (fd2 != -1) + unlink(this->path.c_str()); + + if (rename(output_tmpfile, this->path.c_str()) == -1) + Fatal(ctx) << this->path << ": rename failed: " << errno_string(); + output_tmpfile = nullptr; + } + +private: + int fd2 = -1; +}; + +template +std::unique_ptr> +OutputFile::open(Context &ctx, std::string path, i64 filesize, int perm) { + Timer t(ctx, "open_file"); + + if (path.starts_with('/') && !ctx.arg.chroot.empty()) + path = ctx.arg.chroot + "/" + path_clean(path); + + bool is_special = false; + if (path == "-") { + is_special = true; + } else { + struct stat st; + if (stat(path.c_str(), &st) == 0 && (st.st_mode & S_IFMT) != S_IFREG) + is_special = true; + } + + OutputFile *file; + if (is_special) + file = new MallocOutputFile(ctx, path, filesize, perm); + else + file = new MemoryMappedOutputFile(ctx, path, filesize, perm); + +#ifdef MADV_HUGEPAGE + // Enable transparent huge page for an output memory-mapped file. + // On Linux, it has an effect only on tmpfs mounted with `huge=advise`, + // but it can make the linker ~10% faster. You can try it by creating + // a tmpfs with the following commands + // + // $ mkdir tmp + // $ sudo mount -t tmpfs -o size=2G,huge=advise none tmp + // + // and then specifying a path under the directory as an output file. + madvise(file->buf, filesize, MADV_HUGEPAGE); +#endif + + if (ctx.arg.filler != -1) + memset(file->buf, ctx.arg.filler, filesize); + return std::unique_ptr(file); +} + +// LockingOutputFile is similar to MemoryMappedOutputFile, but it doesn't +// rename output files and instead acquires file lock using flock(). 
+template +LockingOutputFile::LockingOutputFile(Context &ctx, std::string path, + int perm) + : OutputFile(path, 0, true) { + this->fd = ::open(path.c_str(), O_RDWR | O_CREAT, perm); + if (this->fd == -1) + Fatal(ctx) << "cannot open " << path << ": " << errno_string(); + flock(this->fd, LOCK_EX); + + // We may be overwriting to an existing debug info file. We want to + // make the file unusable so that gdb won't use it by accident until + // it's ready. + u8 buf[256] = {}; + (void)!!write(this->fd, buf, sizeof(buf)); +} + +template +void LockingOutputFile::resize(Context &ctx, i64 filesize) { + if (ftruncate(this->fd, filesize) == -1) + Fatal(ctx) << "ftruncate failed: " << errno_string(); + + this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE, + MAP_SHARED, this->fd, 0); + if (this->buf == MAP_FAILED) + Fatal(ctx) << this->path << ": mmap failed: " << errno_string(); + + this->filesize = filesize; + mold::output_buffer_start = this->buf; + mold::output_buffer_end = this->buf + filesize; +} + +template +void LockingOutputFile::close(Context &ctx) { + if (!this->is_unmapped) + munmap(this->buf, this->filesize); + + if (!this->buf2.empty()) { + FILE *out = fdopen(this->fd, "w"); + fseek(out, 0, SEEK_END); + fwrite(&this->buf2[0], this->buf2.size(), 1, out); + fclose(out); + } + + ::close(this->fd); +} + +using E = MOLD_TARGET; + +template class OutputFile; +template class LockingOutputFile; + +} // namespace mold diff --git a/src/output-file-win32.cc b/src/output-file-win32.cc new file mode 100644 index 00000000..68bd26c8 --- /dev/null +++ b/src/output-file-win32.cc @@ -0,0 +1,118 @@ +#include "mold.h" + +#include +#include +#include + +namespace mold { + +template +class MemoryMappedOutputFile : public OutputFile { +public: + MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, int perm) + : OutputFile(path, filesize, true) { + // TODO: use intermediate temporary file for output. + DWORD attrs = (perm & 0200) ? 
FILE_ATTRIBUTE_NORMAL : FILE_ATTRIBUTE_READONLY; + + handle = CreateFileA(path.c_str(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, CREATE_ALWAYS, attrs, nullptr); + if (handle == INVALID_HANDLE_VALUE) + Fatal(ctx) << "cannot open " << path << ": " << GetLastError(); + + HANDLE map = CreateFileMapping(handle, nullptr, PAGE_READWRITE, 0, + filesize, nullptr); + if (!map) + Fatal(ctx) << path << ": CreateFileMapping failed: " << GetLastError(); + + this->buf = (u8 *)MapViewOfFile(map, FILE_MAP_WRITE, 0, 0, filesize); + if (!this->buf) + Fatal(ctx) << path << ": MapViewOfFile failed: " << GetLastError(); + + CloseHandle(map); + + mold::output_buffer_start = this->buf; + mold::output_buffer_end = this->buf + filesize; + } + + ~MemoryMappedOutputFile() { + if (handle != INVALID_HANDLE_VALUE) + CloseHandle(handle); + } + + void close(Context &ctx) override { + Timer t(ctx, "close_file"); + + UnmapViewOfFile(this->buf); + + if (!this->buf2.empty()) { + if (SetFilePointer(handle, 0, nullptr, FILE_END) == INVALID_SET_FILE_POINTER) + Fatal(ctx) << this->path << ": SetFilePointer failed: " + << GetLastError(); + + DWORD written; + if (!WriteFile(handle, this->buf2.data(), this->buf2.size(), &written, + nullptr)) + Fatal(ctx) << this->path << ": WriteFile failed: " << GetLastError(); + } + + CloseHandle(handle); + handle = INVALID_HANDLE_VALUE; + } + +private: + HANDLE handle; +}; + +template +std::unique_ptr> +OutputFile::open(Context &ctx, std::string path, i64 filesize, int perm) { + Timer t(ctx, "open_file"); + + if (path.starts_with('/') && !ctx.arg.chroot.empty()) + path = ctx.arg.chroot + "/" + path_clean(path); + + bool is_special = false; + if (path == "-") { + is_special = true; + } else { + HANDLE h = CreateFileA(path.c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + if (h != INVALID_HANDLE_VALUE) { + if (GetFileType(h) 
!= FILE_TYPE_DISK) + is_special = true; + CloseHandle(h); + } + } + + OutputFile *file; + if (is_special) + file = new MallocOutputFile(ctx, path, filesize, perm); + else + file = new MemoryMappedOutputFile(ctx, path, filesize, perm); + + if (ctx.arg.filler != -1) + memset(file->buf, ctx.arg.filler, filesize); + return std::unique_ptr>(file); +} + +template +LockingOutputFile::LockingOutputFile(Context &ctx, std::string path, + int perm) + : OutputFile(path, 0, true) { + Fatal(ctx) << "LockingOutputFile is not supported on Windows"; +} + +template +void LockingOutputFile::resize(Context &ctx, i64 filesize) {} + +template +void LockingOutputFile::close(Context &ctx) {} + +using E = MOLD_TARGET; + +template class OutputFile; +template class LockingOutputFile; + +} // namespace mold diff --git a/elf/passes.cc b/src/passes.cc similarity index 69% rename from elf/passes.cc rename to src/passes.cc index 20f1a406..807bb2bc 100644 --- a/elf/passes.cc +++ b/src/passes.cc @@ -1,4 +1,5 @@ #include "mold.h" +#include "blake3.h" #include #include @@ -11,49 +12,47 @@ #include #include -namespace mold::elf { +namespace mold { -// Since elf_main is a template, we can't run it without a type parameter. -// We speculatively run elf_main with X86_64, and if the speculation was +// Since mold_main is a template, we can't run it without a type parameter. +// We speculatively run mold_main with X86_64, and if the speculation was // wrong, re-run it with an actual machine type. 
template int redo_main(Context &ctx, int argc, char **argv) { std::string_view target = ctx.arg.emulation; if (target == I386::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == ARM64::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == ARM32::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == RV64LE::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == RV64BE::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == RV32LE::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == RV32BE::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == PPC32::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == PPC64V1::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == PPC64V2::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == S390X::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == SPARC64::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == M68K::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == SH4::target_name) - return elf_main(argc, argv); - if (target == ALPHA::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == LOONGARCH32::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); if (target == LOONGARCH64::target_name) - return elf_main(argc, argv); + return mold_main(argc, argv); unreachable(); } @@ -67,12 +66,11 @@ void apply_exclude_libs(Context &ctx) { std::unordered_set set(ctx.arg.exclude_libs.begin(), ctx.arg.exclude_libs.end()); - for (ObjectFile *file : ctx.objs) { + for (ObjectFile *file : ctx.objs) if 
(!file->archive_name.empty()) if (set.contains("ALL") || set.contains(filepath(file->archive_name).filename().string())) file->exclude_libs = true; - } } template @@ -156,9 +154,11 @@ void create_synthetic_sections(Context &ctx) { ctx.verdef = push(new VerdefSection); if (ctx.arg.emit_relocs) ctx.eh_frame_reloc = push(new EhFrameRelocSection); + if (!ctx.arg.separate_debug_file.empty()) + ctx.gnu_debuglink = push(new GnuDebuglinkSection); if (ctx.arg.shared || !ctx.dsos.empty() || ctx.arg.pie) { - ctx.dynamic = push(new DynamicSection); + ctx.dynamic = push(new DynamicSection(ctx)); // If .dynamic exists, .dynsym and .dynstr must exist as well // since .dynamic refers to them. @@ -171,20 +171,21 @@ void create_synthetic_sections(Context &ctx) { ctx.note_package = push(new NotePackageSection); ctx.note_property = push(new NotePropertySection); + if (!ctx.arg.oformat_binary) { + ElfShdr shdr = {}; + shdr.sh_type = SHT_PROGBITS; + shdr.sh_flags = SHF_MERGE | SHF_STRINGS; + ctx.comment = MergedSection::get_instance(ctx, ".comment", shdr); + } + if constexpr (is_riscv) ctx.extra.riscv_attributes = push(new RiscvAttributesSection); if constexpr (is_ppc64v1) ctx.extra.opd = push(new PPC64OpdSection); - if constexpr (is_sparc) { - if (ctx.arg.is_static) - ctx.extra.tls_get_addr_sec = push(new SparcTlsGetAddrSection); - ctx.extra.tls_get_addr_sym = get_symbol(ctx, "__tls_get_addr"); - } - - if constexpr (is_alpha) - ctx.extra.got = push(new AlphaGotSection); + if constexpr (is_ppc64v2) + ctx.extra.save_restore = push(new PPC64SaveRestoreSection); } template @@ -197,6 +198,20 @@ static void mark_live_objects(Context &ctx) { if (sym->file) sym->file->is_alive = true; + if (!ctx.arg.undefined_glob.empty()) { + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + if (!file->is_alive) { + for (Symbol *sym : file->get_global_syms()) { + if (sym->file == file && ctx.arg.undefined_glob.find(sym->name())) { + file->is_alive = true; + sym->gc_root = true; + break; + } + } + } + 
}); + } + std::vector *> roots; for (InputFile *file : ctx.objs) @@ -209,233 +224,200 @@ static void mark_live_objects(Context &ctx) { tbb::parallel_for_each(roots, [&](InputFile *file, tbb::feeder *> &feeder) { - if (file->is_alive) - file->mark_live_objects(ctx, [&](InputFile *obj) { feeder.add(obj); }); + file->mark_live_objects(ctx, [&](InputFile *obj) { feeder.add(obj); }); }); } template -void do_resolve_symbols(Context &ctx) { - auto for_each_file = [&](std::function *)> fn) { - tbb::parallel_for_each(ctx.objs, fn); - tbb::parallel_for_each(ctx.dsos, fn); - }; - - // Due to legacy reasons, archive members will only get included in the final - // binary if they satisfy one of the undefined symbols in a non-archive object - // file. This is called archive extraction. In finalize_archive_extraction, - // this is processed as follows: - // - // 1. Do preliminary symbol resolution assuming all archive members - // are included. This matches the undefined symbols with ones to be - // extracted from archives. - // - // 2. Do a mark & sweep pass to eliminate unneeded archive members. - // - // Note that the symbol resolution inside finalize_archive_extraction uses a - // different rule. In order to prevent extracting archive members that can be - // satisfied by either non-archive object files or DSOs, the archive members - // are given a lower priority. This is not correct for the general case, where - // *extracted* object files have precedence over DSOs and even non-archive - // files that are passed earlier in the command line. Hence, the symbol - // resolution is thrown away once we determine which archive members to - // extract, and redone later with the formal rule. - { - Timer t(ctx, "extract_archive_members"); - - // Register symbols - for_each_file([&](InputFile *file) { file->resolve_symbols(ctx); }); - - // Mark reachable objects to decide which files to include into an output. - // This also merges symbol visibility. 
- mark_live_objects(ctx); - - // Cleanup. The rule used for archive extraction isn't accurate for the - // general case of symbol extraction, so reset the resolution to be redone - // later. - for_each_file([](InputFile *file) { file->clear_symbols(); }); - - // Now that the symbol references are gone, remove the eliminated files from - // the file list. - std::erase_if(ctx.objs, [](InputFile *file) { return !file->is_alive; }); - std::erase_if(ctx.dsos, [](InputFile *file) { return !file->is_alive; }); - } - - // COMDAT elimination needs to happen exactly here. - // - // It needs to be after archive extraction, otherwise we might assign COMDAT - // leader to an archive member that is not supposed to be extracted. - // - // It needs to happen before symbol resolution, otherwise we could eliminate - // a symbol that is already resolved to and cause dangling references. - { - Timer t(ctx, "eliminate_comdats"); - - tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { - for (ComdatGroupRef &ref : file->comdat_groups) - update_minimum(ref.group->owner, file->priority); - }); - - tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { - for (ComdatGroupRef &ref : file->comdat_groups) - if (ref.group->owner != file->priority) - for (u32 i : ref.members) - if (file->sections[i]) - file->sections[i]->kill(); - }); - } +static void clear_symbols(Context &ctx) { + std::vector *> files; + append(files, ctx.objs); + append(files, ctx.dsos); - // Since we have turned on object files live bits, their symbols - // may now have higher priority than before. So run the symbol - // resolution pass again to get the final resolution result. 
- for_each_file([&](InputFile *file) { file->resolve_symbols(ctx); }); + tbb::parallel_for_each(files, [](InputFile *file) { + for (Symbol *sym : file->get_global_syms()) { + if (__atomic_load_n(&sym->file, __ATOMIC_ACQUIRE) == file) { + sym->origin = 0; + sym->value = -1; + sym->sym_idx = -1; + sym->ver_idx = VER_NDX_UNSPECIFIED; + sym->is_weak = false; + sym->is_imported = false; + sym->is_exported = false; + __atomic_store_n(&sym->file, nullptr, __ATOMIC_RELEASE); + } + } + }); } template void resolve_symbols(Context &ctx) { Timer t(ctx, "resolve_symbols"); - std::vector *> objs = ctx.objs; - std::vector *> dsos = ctx.dsos; + std::vector *> files; + append(files, ctx.objs); + append(files, ctx.dsos); - do_resolve_symbols(ctx); + for (;;) { + // Call resolve_symbols() to find the most appropriate file for each + // symbol. And then mark reachable objects to decide which files to + // include into an output. + tbb::parallel_for_each(files, [&](InputFile *file) { + file->resolve_symbols(ctx); + }); - if (ctx.has_lto_object) { - // Do link-time optimization. We pass all IR object files to the - // compiler backend to compile them into a few ELF object files. - // - // The compiler backend needs to know how symbols are resolved, - // so compute symbol visibility, import/export bits, etc early. mark_live_objects(ctx); - apply_version_script(ctx); - parse_symbol_version(ctx); - compute_import_export(ctx); - // Do LTO. It compiles IR object files into a few big ELF files. - std::vector *> lto_objs = do_lto(ctx); + // Now that we know the exact set of input files that are to be + // included in the output file, we want to redo symbol resolution. + // This is because symbols defined by object files in archive files + // may have risen as a result of mark_live_objects(). + // + // To redo symbol resolution, we want to clear the state first. + clear_symbols(ctx); - // do_resolve_symbols() have removed unreferenced files. 
Restore the - // original files here because some of them may have to be resurrected - // because they are referenced by the ELF files returned from do_lto(). - ctx.objs = objs; - ctx.dsos = dsos; + // COMDAT elimination needs to happen exactly here. + // + // It needs to be after archive extraction, otherwise we might + // assign COMDAT leader to an archive member that is not supposed to + // be extracted. + // + // It needs to happen before the final symbol resolution, otherwise + // we could eliminate a symbol that is already resolved to and cause + // dangling references. + tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { + if (file->is_alive) + for (ComdatGroupRef &ref : file->comdat_groups) + update_minimum(ref.group->owner, file->priority); + }); - append(ctx.objs, lto_objs); + tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { + if (file->is_alive) + for (ComdatGroupRef &ref : file->comdat_groups) + if (ref.group->owner != file->priority) + for (u32 i : ref.members) + if (InputSection *isec = file->sections[i].get()) + isec->is_alive = false; + }); - // Redo name resolution from scratch. - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - file->clear_symbols(); + // Redo symbol resolution + tbb::parallel_for_each(files, [&](InputFile *file) { + if (file->is_alive) + file->resolve_symbols(ctx); }); + // Symbols with hidden visibility need to be resolved within the + // output file. If a hidden symbol was resolved to a DSO, we'll redo + // symbol resolution from scratch with the flag to skip that symbol + // next time. This should be rare. + std::atomic_bool flag = false; + tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { - file->clear_symbols(); + if (file->is_alive) { + for (Symbol *sym : file->symbols) { + if (sym->file == file && sym->visibility == STV_HIDDEN) { + sym->skip_dso = true; + flag = true; + } + } + } }); - // Remove IR object files. 
- for (ObjectFile *file : ctx.objs) - if (file->is_lto_obj) - file->is_alive = false; - - std::erase_if(ctx.objs, [](ObjectFile *file) { return file->is_lto_obj; }); + if (!flag) + return; - do_resolve_symbols(ctx); + clear_symbols(ctx); + resolve_symbols(ctx); } } -// .eh_frame sections are parsed and regenerated by the linker for the purpose -// of deduplication and garbage collection. As such, the input sections should -// not be copied over. -// -// However, in very rare cases (e.g. GCC CRT compiled with LTO) we might need -// to resolve cross-object .eh_frame section references (they only point to -// begin or end and don't depend on the actual section contents). -// Therefore, the sections are "killed" after symbol resolution as a separate -// pass. +// Do link-time optimization. We pass all IR object files to the compiler +// backend to compile them into a few ELF object files. template -void kill_eh_frame_sections(Context &ctx) { - Timer t(ctx, "kill_eh_frame_sections"); +void do_lto(Context &ctx) { + Timer t(ctx, "do_lto"); + + // The compiler backend needs to know how symbols are resolved, so + // compute symbol visibility, import/export bits, etc early. + mark_live_objects(ctx); + apply_version_script(ctx); + parse_symbol_version(ctx); + compute_import_export(ctx); + + // Invoke the LTO plugin. This step compiles IR object files into a few + // big ELF files. + std::vector *> lto_objs = run_lto_plugin(ctx); + append(ctx.objs, lto_objs); + + // Redo name resolution. + clear_symbols(ctx); + // Remove IR object files. 
for (ObjectFile *file : ctx.objs) - if (file->eh_frame_section) - file->eh_frame_section->is_alive = false; + if (file->is_lto_obj) + file->is_alive = false; + + std::erase_if(ctx.objs, [](ObjectFile *file) { return file->is_lto_obj; }); + + resolve_symbols(ctx); } template -void resolve_section_pieces(Context &ctx) { - Timer t(ctx, "resolve_section_pieces"); +void parse_eh_frame_sections(Context &ctx) { + Timer t(ctx, "parse_eh_frame_sections"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - file->initialize_mergeable_sections(ctx); - }); + file->parse_ehframe(ctx); - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - file->resolve_section_pieces(ctx); + for (InputSection *isec : file->eh_frame_sections) + isec->is_alive = false; }); } template -void convert_common_symbols(Context &ctx) { - Timer t(ctx, "convert_common_symbols"); +void create_merged_sections(Context &ctx) { + Timer t(ctx, "create_merged_sections"); + // Convert InputSections to MergeableSections. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - file->convert_common_symbols(ctx); + file->convert_mergeable_sections(ctx); }); -} - -template -static std::string get_cmdline_args(Context &ctx) { - std::stringstream ss; - ss << ctx.cmdline_args[1]; - for (i64 i = 2; i < ctx.cmdline_args.size(); i++) - ss << " " << ctx.cmdline_args[i]; - return ss.str(); -} -template -void add_comment_string(Context &ctx, std::string str) { - MergedSection *sec = - MergedSection::get_instance(ctx, ".comment", SHT_PROGBITS, - SHF_MERGE | SHF_STRINGS, 1, 1); + tbb::parallel_for_each(ctx.merged_sections, + [&](std::unique_ptr> &sec) { + if (sec->shdr.sh_flags & SHF_ALLOC) + sec->resolve(ctx); + }); - std::string_view buf = save_string(ctx, str); - std::string_view data(buf.data(), buf.size() + 1); - sec->insert(ctx, data, hash_string(data), 0); + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + file->reattach_section_pieces(ctx); + }); } template -void 
compute_merged_section_sizes(Context &ctx) { - Timer t(ctx, "compute_merged_section_sizes"); - - // Add an identification string to .comment. - if (!ctx.arg.oformat_binary) - add_comment_string(ctx, mold_version); - - // Embed command line arguments for debugging. - if (char *env = getenv("MOLD_DEBUG"); env && env[0]) - add_comment_string(ctx, "mold command line: " + get_cmdline_args(ctx)); +void convert_common_symbols(Context &ctx) { + Timer t(ctx, "convert_common_symbols"); - tbb::parallel_for_each(ctx.merged_sections, - [&](std::unique_ptr> &sec) { - sec->assign_offsets(ctx); + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + file->convert_common_symbols(ctx); }); } -template -static std::vector> split(std::vector &input, i64 unit) { - std::span span(input); - std::vector> vec; - - while (span.size() >= unit) { - vec.push_back(span.subspan(0, unit)); - span = span.subspan(unit); +template +static bool has_ctors_and_init_array(Context &ctx) { + bool x = false; + bool y = false; + for (ObjectFile *file : ctx.objs) { + x |= file->has_ctors; + y |= file->has_init_array; } - if (!span.empty()) - vec.push_back(span); - return vec; + return x && y; } template static u64 canonicalize_type(std::string_view name, u64 type) { + // Some old assemblers don't recognize these section names and + // create them as SHT_PROGBITS. if (type == SHT_PROGBITS) { if (name == ".init_array" || name.starts_with(".init_array.")) return SHT_INIT_ARRAY; @@ -443,9 +425,16 @@ static u64 canonicalize_type(std::string_view name, u64 type) { return SHT_FINI_ARRAY; } + // The x86-64 psABI defines SHT_X86_64_UNWIND for .eh_frame, allowing + // the linker to recognize the section not by name but by section type. + // However, that spec change was generally considered a mistake; it has + // just complicated the situation. As a result, .eh_frame on x86-64 may + // be either SHT_PROGBITS or SHT_X86_64_UNWIND. We use SHT_PROGBITS + // consistently. 
if constexpr (is_x86_64) if (type == SHT_X86_64_UNWIND) return SHT_PROGBITS; + return type; } @@ -453,11 +442,16 @@ struct OutputSectionKey { bool operator==(const OutputSectionKey &) const = default; std::string_view name; u64 type; - u64 flags; + + struct Hash { + size_t operator()(const OutputSectionKey &k) const { + return combine_hash(hash_string(k.name), std::hash{}(k.type)); + } + }; }; template -std::string_view +static std::string_view get_output_name(Context &ctx, std::string_view name, u64 flags) { if (ctx.arg.relocatable && !ctx.arg.relocatable_merge_sections) return name; @@ -473,13 +467,6 @@ get_output_name(Context &ctx, std::string_view name, u64 flags) { return ".ARM.extab"; } - if constexpr (is_alpha) { - if (name.starts_with(".sdata.")) - return ".sdata"; - if (name.starts_with(".sbss.")) - return ".sbss"; - } - if (ctx.arg.z_keep_text_section_prefix) { static std::string_view prefixes[] = { ".text.hot.", ".text.unknown.", ".text.unlikely.", ".text.startup.", @@ -497,6 +484,7 @@ get_output_name(Context &ctx, std::string_view name, u64 flags) { ".text.", ".data.rel.ro.", ".data.", ".rodata.", ".bss.rel.ro.", ".bss.", ".init_array.", ".fini_array.", ".tbss.", ".tdata.", ".gcc_except_table.", ".ctors.", ".dtors.", ".gnu.warning.", ".openbsd.randomdata.", + ".sdata.", ".sbss.", ".srodata", }; for (std::string_view prefix : prefixes) { @@ -508,43 +496,54 @@ get_output_name(Context &ctx, std::string_view name, u64 flags) { return name; } -// Returns true if a given input section is a .ctors/.dtors that -// should be put into .init_array/.fini_array. -// -// CRT object files contain .ctors/.dtors sections without any -// relocations. They contain sentinel values, 0 and -1, to mark the -// beginning and the end of the initializer/finalizer pointer arrays. We -// do not place them into .init_array/.fini_array because such invalid -// pointer values would simply make the program to crash. 
-template -static bool is_ctors_in_init_array(Context &ctx, InputSection &isec) { - std::string_view name = isec.name(); - return ctx.has_init_array && !isec.get_rels(ctx).empty() && - (name == ".ctors" || name.starts_with(".ctors.") || - name == ".dtors" || name.starts_with(".dtors.")); -} - template static OutputSectionKey -get_output_section_key(Context &ctx, InputSection &isec) { - if (is_ctors_in_init_array(ctx, isec)) { - if (isec.name().starts_with(".ctors")) - return {".init_array", SHT_INIT_ARRAY, SHF_ALLOC | SHF_WRITE}; - return {".fini_array", SHT_FINI_ARRAY, SHF_ALLOC | SHF_WRITE}; +get_output_section_key(Context &ctx, InputSection &isec, + bool ctors_in_init_array) { + // If .init_array/.fini_array exist, .ctors/.dtors must be merged + // with them. + // + // CRT object files contain .ctors/.dtors sections without any + // relocations. They contain sentinel values, 0 and -1, to mark the + // beginning and the end of the initializer/finalizer pointer arrays. + // We do not place them into .init_array/.fini_array because such + // invalid pointer values would simply make the program to crash. + if (ctors_in_init_array && !isec.get_rels(ctx).empty()) { + std::string_view name = isec.name(); + if (name == ".ctors" || name.starts_with(".ctors.")) + return {".init_array", SHT_INIT_ARRAY}; + if (name == ".dtors" || name.starts_with(".dtors.")) + return {".fini_array", SHT_FINI_ARRAY}; } const ElfShdr &shdr = isec.shdr(); std::string_view name = get_output_name(ctx, isec.name(), shdr.sh_flags); u64 type = canonicalize_type(name, shdr.sh_type); - u64 flags = shdr.sh_flags & ~(u64)(SHF_COMPRESSED | SHF_GROUP | SHF_GNU_RETAIN); + return {name, type}; +} - // .init_array is usually writable. We don't want to create multiple - // .init_array output sections, so make it always writable. - // So is .fini_array. 
- if (type == SHT_INIT_ARRAY || type == SHT_FINI_ARRAY) - flags |= SHF_WRITE; +template +static bool is_relro(OutputSection &osec) { + // PT_GNU_RELRO segment is a security mechanism to make more pages + // read-only than we could have done without it. + // + // Traditionally, sections are either read-only or read-write. If a + // section contains dynamic relocations, it must have been put into a + // read-write segment so that the program loader can mutate its + // contents in memory, even if no one will write to it at runtime. + // + // RELRO segment allows us to make such pages writable only when a + // program is being loaded. After that, the page becomes read-only. + // + // Some sections, such as .init, .fini, .got, .dynamic, contain + // dynamic relocations but doesn't have to be writable at runtime, + // so they are put into a RELRO segment. + u32 type = osec.shdr.sh_type; + u32 flags = osec.shdr.sh_flags; - return {name, type, flags}; + return osec.name == ".toc" || osec.name.ends_with(".rel.ro") || + type == SHT_INIT_ARRAY || type == SHT_FINI_ARRAY || + type == SHT_PREINIT_ARRAY || (flags & SHF_TLS); } // Create output sections for input sections. @@ -552,25 +551,18 @@ template void create_output_sections(Context &ctx) { Timer t(ctx, "create_output_sections"); - struct Hash { - size_t operator()(const OutputSectionKey &k) const { - u64 h = hash_string(k.name); - h = combine_hash(h, std::hash{}(k.type)); - h = combine_hash(h, std::hash{}(k.flags)); - return h; - } - }; - - std::unordered_map *, Hash> map; + using MapType = std::unordered_map *, + OutputSectionKey::Hash>; + MapType map; std::shared_mutex mu; - i64 size = ctx.osec_pool.size(); + bool ctors_in_init_array = has_ctors_and_init_array(ctx); // Instantiate output sections tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { // Make a per-thread cache of the main map to avoid lock contention. // It makes a noticeable difference if we have millions of input sections. 
- decltype(map) cache; + MapType cache; { std::shared_lock lock(mu); cache = map; @@ -581,46 +573,57 @@ void create_output_sections(Context &ctx) { continue; const ElfShdr &shdr = isec->shdr(); - if (ctx.arg.relocatable && (shdr.sh_flags & SHF_GROUP)) { - OutputSection *osec = - new OutputSection(ctx, isec->name(), shdr.sh_type, shdr.sh_flags); + u32 sh_flags = shdr.sh_flags & ~SHF_MERGE & ~SHF_STRINGS & + ~SHF_COMPRESSED & ~SHF_GNU_RETAIN; + + if (ctx.arg.relocatable && (sh_flags & SHF_GROUP)) { + OutputSection *osec = new OutputSection(isec->name(), shdr.sh_type); + osec->sh_flags = sh_flags; isec->output_section = osec; ctx.osec_pool.emplace_back(osec); continue; } - OutputSectionKey key = get_output_section_key(ctx, *isec); + auto get_or_insert = [&] { + OutputSectionKey key = + get_output_section_key(ctx, *isec, ctors_in_init_array); - if (auto it = cache.find(key); it != cache.end()) { - isec->output_section = it->second; - continue; - } + if (auto it = cache.find(key); it != cache.end()) + return it->second; - auto get_or_insert = [&] { { std::shared_lock lock(mu); - if (auto it = map.find(key); it != map.end()) + if (auto it = map.find(key); it != map.end()) { + cache.insert({key, it->second}); return it->second; + } } std::unique_ptr> osec = - std::make_unique>(ctx, key.name, key.type, key.flags); + std::make_unique>(key.name, key.type); std::unique_lock lock(mu); auto [it, inserted] = map.insert({key, osec.get()}); - OutputSection *ret = it->second; if (inserted) ctx.osec_pool.emplace_back(std::move(osec)); - return ret; + cache.insert({key, it->second}); + return it->second; }; OutputSection *osec = get_or_insert(); + sh_flags &= ~SHF_GROUP; + if ((osec->sh_flags & sh_flags) != sh_flags) + osec->sh_flags |= sh_flags; isec->output_section = osec; - cache.insert({key, osec}); } }); + for (std::unique_ptr> &osec : ctx.osec_pool) { + osec->shdr.sh_flags = osec->sh_flags; + osec->is_relro = is_relro(*osec); + } + // Add input sections to output sections 
std::vector *> chunks; for (i64 i = size; i < ctx.osec_pool.size(); i++) @@ -633,8 +636,7 @@ void create_output_sections(Context &ctx) { // Add output sections and mergeable sections to ctx.chunks for (std::unique_ptr> &osec : ctx.merged_sections) - if (osec->shdr.sh_size) - chunks.push_back(osec.get()); + chunks.push_back(osec.get()); // Sections are added to the section lists in an arbitrary order // because they are created in parallel. Sort them to to make the @@ -777,9 +779,11 @@ void add_synthetic_symbols(Context &ctx) { if constexpr (supports_tlsdesc) ctx._TLS_MODULE_BASE_ = add("_TLS_MODULE_BASE_", STT_TLS); - if constexpr (is_riscv) - if (!ctx.arg.shared) - ctx.__global_pointer = add("__global_pointer$"); + if constexpr (is_riscv) { + ctx.__global_pointer = add("__global_pointer$"); + if (ctx.dynamic && !ctx.arg.shared) + ctx.__global_pointer->is_exported = true; + } if constexpr (is_arm32) { ctx.__exidx_start = add("__exidx_start"); @@ -792,18 +796,29 @@ void add_synthetic_symbols(Context &ctx) { if constexpr (is_ppc32) ctx.extra._SDA_BASE_ = add("_SDA_BASE_"); + auto add_start_stop = [&](std::string s) { + add(save_string(ctx, s)); + if (ctx.arg.z_start_stop_visibility_protected) + get_symbol(ctx, save_string(ctx, s))->is_exported = true; + }; + for (Chunk *chunk : ctx.chunks) { if (std::optional name = get_start_stop_name(ctx, *chunk)) { - add(save_string(ctx, "__start_" + *name)); - add(save_string(ctx, "__stop_" + *name)); + add_start_stop("__start_" + *name); + add_start_stop("__stop_" + *name); if (ctx.arg.physical_image_base) { - add(save_string(ctx, "__phys_start_" + *name)); - add(save_string(ctx, "__phys_stop_" + *name)); + add_start_stop("__phys_start_" + *name); + add_start_stop("__phys_stop_" + *name); } } } + if constexpr (is_ppc64v2) + for (std::pair p : ppc64_save_restore_insns) + if (std::string_view label = p.first; !label.empty()) + add(label); + obj.elf_syms = ctx.internal_esyms; obj.has_symver.resize(ctx.internal_esyms.size() - 1); @@ 
-811,9 +826,12 @@ void add_synthetic_symbols(Context &ctx) { // Make all synthetic symbols relative ones by associating them to // a dummy output section. - for (Symbol *sym : obj.symbols) - if (sym->file == &obj) + for (Symbol *sym : obj.symbols) { + if (sym->file == &obj) { sym->set_output_section(ctx.symtab); + sym->is_imported = false; + } + } // Handle --defsym symbols. for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) { @@ -840,6 +858,15 @@ void add_synthetic_symbols(Context &ctx) { } } +template +void apply_section_align(Context &ctx) { + for (Chunk *chunk : ctx.chunks) + if (OutputSection *osec = chunk->to_osec()) + if (auto it = ctx.arg.section_align.find(osec->name); + it != ctx.arg.section_align.end()) + osec->shdr.sh_addralign = it->second; +} + template void check_cet_errors(Context &ctx) { bool warning = (ctx.arg.z_cet_report == CET_REPORT_WARNING); @@ -879,7 +906,7 @@ void check_cet_errors(Context &ctx) { template void print_dependencies(Context &ctx) { - SyncOut(ctx) << + Out(ctx) << R"(# This is an output of the mold linker's --print-dependencies option. # # Each line consists of 4 fields, , , and @@ -893,13 +920,13 @@ R"(# This is an output of the mold linker's --print-dependencies option. auto println = [&](auto &src, Symbol &sym, ElfSym &esym) { if (InputSection *isec = sym.get_input_section()) - SyncOut(ctx) << src << "\t" << *isec - << "\t" << (esym.is_weak() ? 'w' : 'u') - << "\t" << sym; + Out(ctx) << src << "\t" << *isec + << "\t" << (esym.is_weak() ? 'w' : 'u') + << "\t" << sym; else - SyncOut(ctx) << src << "\t" << *sym.file - << "\t" << (esym.is_weak() ? 'w' : 'u') - << "\t" << sym; + Out(ctx) << src << "\t" << *sym.file + << "\t" << (esym.is_weak() ? 'w' : 'u') + << "\t" << sym; }; for (ObjectFile *file : ctx.objs) { @@ -910,7 +937,7 @@ R"(# This is an output of the mold linker's --print-dependencies option. 
std::unordered_set visited; for (const ElfRel &r : isec->get_rels(ctx)) { - if (r.r_type == R_NONE) + if (r.r_type == R_NONE || file->elf_syms.size() <= r.r_sym) continue; ElfSym &esym = file->elf_syms[r.r_sym]; @@ -966,20 +993,18 @@ void write_repro_file(Context &ctx) { if (!tar) Fatal(ctx) << "cannot open " << path << ": " << errno_string(); - tar->append("response.txt", save_string(ctx, create_response_file(ctx))); - tar->append("version.txt", save_string(ctx, mold_version + "\n")); + tar->append("response.txt", create_response_file(ctx)); + tar->append("version.txt", get_mold_version() + "\n"); - std::unordered_set seen; - for (std::unique_ptr>> &mf : ctx.mf_pool) { - if (!mf->parent) { - std::string path = to_abs_path(mf->name).string(); - if (seen.insert(path).second) { - // We reopen a file because we may have modified the contents of mf - // in memory, which is mapped with PROT_WRITE and MAP_PRIVATE. - MappedFile> *mf2 = MappedFile>::must_open(ctx, path); - tar->append(path, mf2->get_contents()); - mf2->unmap(); - } + std::unordered_set seen; + + for (std::unique_ptr &mf : ctx.mf_pool) { + if (!mf->parent && seen.insert(mf->name).second) { + // We reopen a file because we may have modified the contents of mf + // in memory, which is mapped with PROT_WRITE and MAP_PRIVATE. + MappedFile *mf2 = must_open_file(ctx, mf->name); + tar->append(to_abs_path(mf->name).string(), mf2->get_contents()); + mf2->unmap(); } } } @@ -1014,6 +1039,50 @@ void check_duplicate_symbols(Context &ctx) { ctx.checkpoint(); } +// If --no-allow-shlib-undefined is specified, we report errors on +// unresolved symbols in shared libraries. This is useful when you are +// creating a final executable and want to make sure that all symbols +// including ones in shared libraries have been resolved. +// +// If you do not pass --no-allow-shlib-undefined, undefined symbols in +// shared libraries will be reported as run-time error by the dynamic +// linker. 
+template +void check_shlib_undefined(Context &ctx) { + Timer t(ctx, "check_shlib_undefined"); + + auto is_sparc_register = [](const ElfSym &esym) { + // Dynamic symbol table for SPARC contains bogus entries which + // we need to ignore + if constexpr (is_sparc) + return esym.st_type == STT_SPARC_REGISTER; + return false; + }; + + // Obtain a list of known shared library names. + std::unordered_set sonames; + for (SharedFile *file : ctx.dsos) + sonames.insert(file->soname); + + tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { + // Skip the file if it depends on a file that we know nothing about. + // This is because missing symbols may be provided by that unknown file. + for (std::string_view needed : file->get_dt_needed(ctx)) + if (sonames.count(needed) == 0) + return; + + // Check if all undefined symbols have been resolved. + for (i64 i = 0; i < file->elf_syms.size(); i++) { + const ElfSym &esym = file->elf_syms[i]; + Symbol &sym = *file->symbols[i]; + if (esym.is_undef() && !esym.is_weak() && !sym.file && + !is_sparc_register(esym)) + Error(ctx) << *file << ": --no-allow-shlib-undefined: undefined symbol: " + << sym; + } + }); +} + template void check_symbol_types(Context &ctx) { Timer t(ctx, "check_symbol_types"); @@ -1022,6 +1091,14 @@ void check_symbol_types(Context &ctx) { append(files, ctx.objs); append(files, ctx.dsos); + auto canonicalize = [](u32 ty) -> u32 { + if (ty == STT_GNU_IFUNC) + return STT_FUNC; + if (ty == STT_COMMON) + return STT_OBJECT; + return ty; + }; + tbb::parallel_for_each(files.begin(), files.end(), [&](InputFile *file) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { Symbol &sym = *file->symbols[i]; @@ -1031,15 +1108,14 @@ void check_symbol_types(Context &ctx) { const ElfSym &esym1 = sym.esym(); const ElfSym &esym2 = file->elf_syms[i]; - u32 ty1 = (esym1.st_type == STT_GNU_IFUNC) ? (u32)STT_FUNC : esym1.st_type; - u32 ty2 = (esym2.st_type == STT_GNU_IFUNC) ? 
(u32)STT_FUNC : esym2.st_type; - - if (ty1 != STT_NOTYPE && ty2 != STT_NOTYPE && ty1 != ty2) + if (esym1.st_type != STT_NOTYPE && esym2.st_type != STT_NOTYPE && + canonicalize(esym1.st_type) != canonicalize(esym2.st_type)) { Warn(ctx) << "symbol type mismatch: " << sym << '\n' << ">>> defined in " << *sym.file << " as " << stt_to_string(esym1.st_type) << '\n' << ">>> defined in " << *file << " as " << stt_to_string(esym2.st_type); + } } }); } @@ -1080,6 +1156,11 @@ template void sort_init_fini(Context &ctx) { Timer t(ctx, "sort_init_fini"); + struct Entry { + InputSection *sect; + i64 prio; + }; + for (Chunk *chunk : ctx.chunks) { if (OutputSection *osec = chunk->to_osec()) { if (osec->name == ".init_array" || osec->name == ".preinit_array" || @@ -1087,19 +1168,20 @@ void sort_init_fini(Context &ctx) { if (ctx.arg.shuffle_sections == SHUFFLE_SECTIONS_REVERSE) std::reverse(osec->members.begin(), osec->members.end()); - std::unordered_map *, i64> map; + std::vector vec; for (InputSection *isec : osec->members) { std::string_view name = isec->name(); if (name.starts_with(".ctors") || name.starts_with(".dtors")) - map.insert({isec, 65535 - get_ctor_dtor_priority(isec)}); + vec.push_back({isec, 65535 - get_ctor_dtor_priority(isec)}); else - map.insert({isec, get_init_fini_priority(isec)}); + vec.push_back({isec, get_init_fini_priority(isec)}); } - sort(osec->members, [&](InputSection *a, InputSection *b) { - return map[a] < map[b]; - }); + sort(vec, [&](const Entry &a, const Entry &b) { return a.prio < b.prio; }); + + for (i64 i = 0; i < vec.size(); i++) + osec->members[i] = vec[i].sect; } } } @@ -1109,19 +1191,25 @@ template void sort_ctor_dtor(Context &ctx) { Timer t(ctx, "sort_ctor_dtor"); + struct Entry { + InputSection *sect; + i64 prio; + }; + for (Chunk *chunk : ctx.chunks) { if (OutputSection *osec = chunk->to_osec()) { if (osec->name == ".ctors" || osec->name == ".dtors") { if (ctx.arg.shuffle_sections != SHUFFLE_SECTIONS_REVERSE) 
std::reverse(osec->members.begin(), osec->members.end()); - std::unordered_map *, i64> map; + std::vector vec; for (InputSection *isec : osec->members) - map.insert({isec, get_ctor_dtor_priority(isec)}); + vec.push_back({isec, get_ctor_dtor_priority(isec)}); - sort(osec->members, [&](InputSection *a, InputSection *b) { - return map[a] < map[b]; - }); + sort(vec, [&](const Entry &a, const Entry &b) { return a.prio < b.prio; }); + + for (i64 i = 0; i < vec.size(); i++) + osec->members[i] = vec[i].sect; } } } @@ -1141,29 +1229,33 @@ template void fixup_ctors_in_init_array(Context &ctx) { Timer t(ctx, "fixup_ctors_in_init_array"); - for (Chunk *chunk : ctx.chunks) { - if (OutputSection *osec = chunk->to_osec()) { - if (osec->name == ".init_array" || osec->name == ".fini_array") { - for (InputSection *isec : osec->members) { - if (isec->name().starts_with(".ctors") || - isec->name().starts_with(".dtors")) { - if (isec->sh_size % sizeof(Word)) { - Error(ctx) << *isec << ": section corrupted"; - continue; - } - - u8 *buf = (u8 *)isec->contents.data(); - std::reverse((Word *)buf, (Word *)(buf + isec->sh_size)); - - std::span> rels = isec->get_rels(ctx); - for (ElfRel &r : rels) - r.r_offset = isec->sh_size - r.r_offset - sizeof(Word); - std::reverse(rels.begin(), rels.end()); - } + auto fixup = [&](OutputSection &osec) { + for (InputSection *isec : osec.members) { + if (isec->name().starts_with(".ctors") || + isec->name().starts_with(".dtors")) { + if (isec->sh_size % sizeof(Word)) { + Error(ctx) << *isec << ": section corrupted"; + continue; } + + u8 *buf = (u8 *)isec->contents.data(); + std::reverse((Word *)buf, (Word *)(buf + isec->sh_size)); + + std::span> rels = isec->get_rels(ctx); + for (ElfRel &r : rels) + r.r_offset = isec->sh_size - r.r_offset - sizeof(Word); + std::reverse(rels.begin(), rels.end()); } } - } + }; + + if (Chunk *chunk = find_chunk(ctx, ".init_array")) + if (OutputSection *osec = chunk->to_osec()) + fixup(*osec); + + if (Chunk *chunk = 
find_chunk(ctx, ".fini_array")) + if (OutputSection *osec = chunk->to_osec()) + fixup(*osec); } template @@ -1232,83 +1324,31 @@ template void compute_section_sizes(Context &ctx) { Timer t(ctx, "compute_section_sizes"); - struct Group { - i64 size = 0; - i64 p2align = 0; - i64 offset = 0; - std::span *> members; - }; - - tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { - OutputSection *osec = chunk->to_osec(); - if (!osec) - return; - - // This pattern will be processed in the next loop. - if constexpr (needs_thunk) - if ((osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) - return; - - // Since one output section may contain millions of input sections, - // we first split input sections into groups and assign offsets to - // groups. - std::vector groups; - constexpr i64 group_size = 10000; - - for (std::span *> span : split(osec->members, group_size)) - groups.push_back(Group{.members = span}); + if constexpr (needs_thunk) { + // We cannot use parallel-for for compute_section_size() which may + // call create_range_extension_thunks() because that function is + // not thread-safe. + for (Chunk *chunk : ctx.chunks) + if (chunk->shdr.sh_flags & SHF_EXECINSTR) + chunk->compute_section_size(ctx); - tbb::parallel_for_each(groups, [](Group &group) { - for (InputSection *isec : group.members) { - group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size; - group.p2align = std::max(group.p2align, isec->p2align); - } + tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { + if (!(chunk->shdr.sh_flags & SHF_EXECINSTR)) + chunk->compute_section_size(ctx); }); - - ElfShdr &shdr = osec->shdr; - shdr.sh_size = 0; - - for (i64 i = 0; i < groups.size(); i++) { - shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align); - groups[i].offset = shdr.sh_size; - shdr.sh_size += groups[i].size; - shdr.sh_addralign = std::max(shdr.sh_addralign, 1 << groups[i].p2align); - } - - // Assign offsets to input sections. 
- tbb::parallel_for_each(groups, [](Group &group) { - i64 offset = group.offset; - for (InputSection *isec : group.members) { - offset = align_to(offset, 1 << isec->p2align); - isec->offset = offset; - offset += isec->sh_size; - } + } else { + tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { + chunk->compute_section_size(ctx); }); - }); - - // On ARM32 or ARM64, we may need to create so-called "range extension - // thunks" to extend branch instructions reach, as they can jump only - // to ±16 MiB or ±128 MiB, respecitvely. - // - // In the following loop, We compute the sizes of sections while - // inserting thunks. This pass cannot be parallelized. That is, - // create_range_extension_thunks is parallelized internally, but the - // function itself is not thread-safe. - if constexpr (needs_thunk) { - Timer t2(ctx, "create_range_extension_thunks"); - - if (!ctx.arg.relocatable) - for (Chunk *chunk : ctx.chunks) - if (OutputSection *osec = chunk->to_osec()) - if (osec->shdr.sh_flags & SHF_EXECINSTR) - osec->create_range_extension_thunks(ctx); } } // Find all unresolved symbols and attach them to the most appropriate files. -// Note that even a symbol that will be reported as an undefined symbol will -// get an owner file in this function. Such symbol will be reported by -// ObjectFile::scan_relocations(). +// +// Note that even a symbol that will be reported as an undefined symbol +// will get an owner file in this function. Such symbol will be reported +// by ObjectFile::scan_relocations(). This is because we want to report +// errors only on symbols that are actually referenced. template void claim_unresolved_symbols(Context &ctx) { Timer t(ctx, "claim_unresolved_symbols"); @@ -1349,9 +1389,9 @@ void claim_unresolved_symbols(Context &ctx) { auto claim = [&](bool is_imported) { if (sym.is_traced) - SyncOut(ctx) << "trace-symbol: " << *file << ": unresolved" - << (esym.is_weak() ? 
" weak" : "") - << " symbol " << sym; + Out(ctx) << "trace-symbol: " << *file << ": unresolved" + << (esym.is_weak() ? " weak" : "") + << " symbol " << sym; sym.file = file; sym.origin = 0; @@ -1386,7 +1426,8 @@ void claim_unresolved_symbols(Context &ctx) { // promoted to dynamic symbols for compatibility with other linkers. // Some major programs, notably Firefox, depend on the behavior // (they use this loophole to export symbols from libxul.so). - if (ctx.arg.shared && sym.visibility != STV_HIDDEN && !ctx.arg.z_defs) { + if (ctx.arg.shared && sym.visibility != STV_HIDDEN && + ctx.arg.unresolved_symbols != UNRESOLVED_ERROR) { claim(true); continue; } @@ -1406,6 +1447,14 @@ void scan_relocations(Context &ctx) { file->scan_relocations(ctx); }); + // Word-size absolute relocations (e.g. R_X86_64_64) are handled + // separately because they can be promoted to dynamic relocations. + tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { + if (OutputSection *osec = chunk->to_osec()) + if (osec->shdr.sh_flags & SHF_ALLOC) + osec->scan_abs_relocations(ctx); + }); + // Exit if there was a relocation that refers an undefined symbol. ctx.checkpoint(); @@ -1466,7 +1515,7 @@ void scan_relocations(Context &ctx) { ctx.got->add_tlsdesc_symbol(ctx, sym); if (sym->flags & NEEDS_COPYREL) { - if (((SharedFile *)sym->file)->is_readonly(sym)) + if (ctx.arg.z_relro && ((SharedFile *)sym->file)->is_readonly(sym)) ctx.copyrel_relro->add_symbol(ctx, sym); else ctx.copyrel->add_symbol(ctx, sym); @@ -1479,38 +1528,62 @@ void scan_relocations(Context &ctx) { sym->flags = 0; } - if constexpr (is_alpha) - ctx.extra.got->finalize(); - if (ctx.has_textrel && ctx.arg.warn_textrel) Warn(ctx) << "creating a DT_TEXTREL in an output file"; } +// Compute the is_weak bit for each imported symbol. +// +// If all references to a shared symbol is weak, the symbol is marked +// as weak in .dynsym. 
+template +void compute_imported_symbol_weakness(Context &ctx) { + Timer t(ctx, "compute_imported_symbol_weakness"); + + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { + const ElfSym &esym = file->elf_syms[i]; + Symbol &sym = *file->symbols[i]; + + if (esym.is_undef() && !esym.is_weak() && sym.file && sym.file->is_dso) { + std::scoped_lock lock(sym.mu); + sym.is_weak = false; + } + } + }); +} + // Report all undefined symbols, grouped by symbol. template void report_undef_errors(Context &ctx) { - constexpr i64 max_errors = 3; + constexpr i64 MAX_ERRORS = 3; + + if (ctx.arg.unresolved_symbols == UNRESOLVED_IGNORE) + return; for (auto &pair : ctx.undef_errors) { - std::string_view sym_name = pair.first; + Symbol *sym = pair.first; std::span errors = pair.second; - if (ctx.arg.demangle) - sym_name = demangle(sym_name); - std::stringstream ss; - ss << "undefined symbol: " << sym_name << "\n"; + ss << "undefined symbol: " + << (ctx.arg.demangle ? demangle(*sym) : sym->name()) + << "\n"; - for (i64 i = 0; i < errors.size() && i < max_errors; i++) + for (i64 i = 0; i < errors.size() && i < MAX_ERRORS; i++) ss << errors[i]; - if (errors.size() > max_errors) - ss << ">>> referenced " << (errors.size() - max_errors) << " more times\n"; + if (MAX_ERRORS < errors.size()) + ss << ">>> referenced " << (errors.size() - MAX_ERRORS) << " more times\n"; + + // Remove the trailing '\n' because Error/Warn adds it automatically + std::string msg = ss.str(); + msg.pop_back(); if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR) - Error(ctx) << ss.str(); - else if (ctx.arg.unresolved_symbols == UNRESOLVED_WARN) - Warn(ctx) << ss.str(); + Error(ctx) << msg; + else + Warn(ctx) << msg; } ctx.checkpoint(); @@ -1546,13 +1619,21 @@ void copy_chunks(Context &ctx) { // For --relocatable and --emit-relocs, we want to copy non-relocation // sections first. 
This is because REL-type relocation sections (as // opposed to RELA-type) stores relocation addends to target sections. + // + // We also does that for SH4 because despite being RELA, we always need + // to write addends to relocated places for SH4. + auto is_rel = [](Chunk &chunk) { + return chunk.shdr.sh_type == SHT_REL || + (is_sh4 && chunk.shdr.sh_type == SHT_RELA); + }; + tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { - if (chunk->shdr.sh_type != (E::is_rela ? SHT_RELA : SHT_REL)) + if (!is_rel(*chunk)) copy(*chunk); }); tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { - if (chunk->shdr.sh_type == (E::is_rela ? SHT_RELA : SHT_REL)) + if (is_rel(*chunk)) copy(*chunk); }); @@ -1562,8 +1643,21 @@ void copy_chunks(Context &ctx) { // undefined errors. report_undef_errors(ctx); - if constexpr (is_arm32) - fixup_arm_exidx_section(ctx); + // Zero-clear paddings between chunks + auto zero = [&](Chunk *chunk, i64 next_start) { + i64 pos = chunk->shdr.sh_offset + chunk->shdr.sh_size; + memset(ctx.buf + pos, 0, next_start - pos); + }; + + std::vector *> chunks = ctx.chunks; + + std::erase_if(chunks, [](Chunk *chunk) { + return chunk->shdr.sh_type == SHT_NOBITS; + }); + + for (i64 i = 1; i < chunks.size(); i++) + zero(chunks[i - 1], chunks[i]->shdr.sh_offset); + zero(chunks.back(), ctx.output_file->filesize); } template @@ -1575,16 +1669,77 @@ void construct_relr(Context &ctx) { }); } +// The hash function for .gnu.hash. 
+static u32 djb_hash(std::string_view name) { + u32 h = 5381; + for (u8 c : name) + h = (h << 5) + h + c; + return h; +} + template -void create_output_symtab(Context &ctx) { - Timer t(ctx, "compute_symtab_size"); +void sort_dynsyms(Context &ctx) { + Timer t(ctx, "sort_dynsyms"); - if (!ctx.arg.strip_all && !ctx.arg.retain_symbols_file) { - tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { - chunk->compute_symtab_size(ctx); + std::span *> syms = ctx.dynsym->symbols; + if (syms.empty()) + return; + + // In any symtab, local symbols must precede global symbols. + auto first_global = std::stable_partition(syms.begin() + 1, syms.end(), + [&](Symbol *sym) { + return sym->is_local(ctx); + }); + + // .gnu.hash imposes more restrictions on the order of the symbols in + // .dynsym. + if (ctx.gnu_hash) { + auto first_exported = std::stable_partition(first_global, syms.end(), + [&](Symbol *sym) { + return !sym->is_exported; + }); + + // Count the number of exported symbols to compute the size of .gnu.hash. + i64 num_exported = syms.end() - first_exported; + u32 num_buckets = num_exported / ctx.gnu_hash->LOAD_FACTOR + 1; + + tbb::parallel_for_each(first_exported, syms.end(), [&](Symbol *sym) { + sym->set_djb_hash(ctx, djb_hash(sym->name())); + }); + + tbb::parallel_sort(first_exported, syms.end(), + [&](Symbol *a, Symbol *b) { + return std::tuple(a->get_djb_hash(ctx) % num_buckets, a->name()) < + std::tuple(b->get_djb_hash(ctx) % num_buckets, b->name()); }); + + ctx.gnu_hash->num_buckets = num_buckets; + ctx.gnu_hash->num_exported = num_exported; } + // Compute .dynstr size + ctx.dynstr->dynsym_offset = ctx.dynstr->shdr.sh_size; + + tbb::enumerable_thread_specific size; + tbb::parallel_for((i64)1, (i64)syms.size(), [&](i64 i) { + syms[i]->set_dynsym_idx(ctx, i); + size.local() += syms[i]->name().size() + 1; + }); + + ctx.dynstr->shdr.sh_size += size.combine(std::plus()); + + // ELF's symbol table sh_info holds the offset of the first global symbol. 
+ ctx.dynsym->shdr.sh_info = first_global - syms.begin(); +} + +template +void create_output_symtab(Context &ctx) { + Timer t(ctx, "compute_symtab_size"); + + tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { + chunk->compute_symtab_size(ctx); + }); + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->compute_symtab_size(ctx); }); @@ -1598,69 +1753,82 @@ template void apply_version_script(Context &ctx) { Timer t(ctx, "apply_version_script"); - auto is_simple = [&] { - for (VersionPattern &v : ctx.version_patterns) - if (v.is_cpp || v.pattern.find_first_of("*?[") != v.pattern.npos) - return false; - return true; - }; - - // If all patterns are simple (i.e. not containing any meta- - // characters and is not a C++ name), we can simply look up - // symbols. - if (is_simple()) { - for (VersionPattern &v : ctx.version_patterns) { - Symbol *sym = get_symbol(ctx, v.pattern); + // Assign versions to symbols specified with `extern "C++"` or + // wildcard patterns first. + MultiGlob matcher; + MultiGlob cpp_matcher; - if (!sym->file && !ctx.arg.undefined_version) - Warn(ctx) << v.source << ": cannot assign version `" << v.ver_str - << "` to symbol `" << *sym << "`: symbol not found"; + // The "local:" label has a special meaning in the version script. + // It can appear in any VERSION clause, and it hides matched symbols + // unless other non-local patterns match to them. In other words, + // "local:" has lower precedence than other version definitions. + // + // If two or more non-local patterns match to the same symbol, the + // last one takes precedence. + std::vector patterns = ctx.version_patterns; - if (sym->file && !sym->file->is_dso) - sym->ver_idx = v.ver_idx; - } - return; - } + std::stable_partition(patterns.begin(), patterns.end(), + [](const VersionPattern &pat) { + return pat.ver_idx == VER_NDX_LOCAL; + }); - // Otherwise, use glob pattern matchers. 
- MultiGlob matcher; - MultiGlob cpp_matcher; + auto has_wildcard = [](std::string_view str) { + return str.find_first_of("*?[") != str.npos; + }; - for (i64 i = 0; i < ctx.version_patterns.size(); i++) { - VersionPattern &v = ctx.version_patterns[i]; + for (i64 i = 0; i < patterns.size(); i++) { + VersionPattern &v = patterns[i]; if (v.is_cpp) { if (!cpp_matcher.add(v.pattern, i)) Fatal(ctx) << "invalid version pattern: " << v.pattern; - } else { + } else if (has_wildcard(v.pattern)) { if (!matcher.add(v.pattern, i)) Fatal(ctx) << "invalid version pattern: " << v.pattern; } } - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - for (Symbol *sym : file->get_global_syms()) { - if (sym->file != file) - continue; + if (!matcher.empty() || !cpp_matcher.empty()) { + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + for (Symbol *sym : file->get_global_syms()) { + if (sym->file != file) + continue; - std::string_view name = sym->name(); - i64 match = INT64_MAX; + std::string_view name = sym->name(); + i64 match = -1; - if (std::optional idx = matcher.find(name)) - match = std::min(match, *idx); + if (std::optional idx = matcher.find(name)) + match = std::max(match, *idx); - // Match non-mangled symbols against the C++ pattern as well. - // Weird, but required to match other linkers' behavior. - if (!cpp_matcher.empty()) { - if (std::optional s = cpp_demangle(name)) - name = *s; - if (std::optional idx = cpp_matcher.find(name)) - match = std::min(match, *idx); + // Match non-mangled symbols against the C++ pattern as well. + // Weird, but required to match other linkers' behavior. + if (!cpp_matcher.empty()) { + if (std::optional s = demangle_cpp(name)) + name = *s; + if (std::optional idx = cpp_matcher.find(name)) + match = std::max(match, *idx); + } + + if (match != -1) + sym->ver_idx = patterns[match].ver_idx; } + }); + } + + // Next, assign versions to symbols specified by exact name. 
+ // In other words, exact matches have higher precedence over + // wildcard or `extern "C++"` patterns. + for (VersionPattern &v : patterns) { + if (!v.is_cpp && !has_wildcard(v.pattern)) { + Symbol *sym = get_symbol(ctx, v.pattern); + + if (!sym->file && !ctx.arg.undefined_version) + Warn(ctx) << v.source << ": cannot assign version `" << v.ver_str + << "` to symbol `" << *sym << "`: symbol not found"; - if (match != INT64_MAX) - sym->ver_idx = ctx.version_patterns[match].ver_idx; + if (sym->file && !sym->file->is_dso) + sym->ver_idx = v.ver_idx; } - }); + } } template @@ -1675,6 +1843,9 @@ void parse_symbol_version(Context &ctx) { verdefs[ctx.arg.version_definitions[i]] = i + VER_NDX_LAST_RESERVED + 1; tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + if (file == ctx.internal_obj) + return; + for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { // Match VERSION part of symbol foo@VERSION with version definitions. if (!file->has_symver.get(i - file->first_global)) @@ -1684,14 +1855,8 @@ void parse_symbol_version(Context &ctx) { if (sym->file != file) continue; - std::string_view ver; - - if (file->is_lto_obj) { - ver = file->lto_symbol_versions[i - file->first_global]; - } else { - const char *name = file->symbol_strtab.data() + file->elf_syms[i].st_name; - ver = strchr(name, '@') + 1; - } + const char *name = file->symbol_strtab.data() + file->elf_syms[i].st_name; + std::string_view ver = strchr(name, '@') + 1; bool is_default = false; if (ver.starts_with('@')) { @@ -1724,6 +1889,47 @@ void parse_symbol_version(Context &ctx) { }); } +template +static bool should_export(Context &ctx, Symbol &sym) { + if (sym.visibility == STV_HIDDEN) + return false; + + switch (sym.ver_idx) { + case VER_NDX_UNSPECIFIED: + if (ctx.arg.dynamic_list_data) + if (u32 ty = sym.get_type(); ty != STT_FUNC && ty != STT_GNU_IFUNC) + return true; + if (ctx.arg.shared) + return !((ObjectFile *)sym.file)->exclude_libs; + return ctx.arg.export_dynamic; + case 
VER_NDX_LOCAL: + return false; + default: + return true; + } +}; + +template +static bool is_protected(Context &ctx, Symbol &sym) { + if (sym.visibility == STV_PROTECTED) + return true; + + switch (ctx.arg.Bsymbolic) { + case BSYMBOLIC_ALL: + return true; + case BSYMBOLIC_NONE: + return false; + case BSYMBOLIC_FUNCTIONS: + return sym.get_type() == STT_FUNC; + case BSYMBOLIC_NON_WEAK: + return !sym.is_weak; + case BSYMBOLIC_NON_WEAK_FUNCTIONS: + return !sym.is_weak && sym.get_type() == STT_FUNC; + default: + unreachable(); + } +} + template void compute_import_export(Context &ctx) { Timer t(ctx, "compute_import_export"); @@ -1742,51 +1948,29 @@ void compute_import_export(Context &ctx) { }); } - auto should_export = [&](Symbol &sym) { - if (sym.visibility == STV_HIDDEN) - return false; - - switch (sym.ver_idx) { - case VER_NDX_UNSPECIFIED: - if (ctx.arg.shared) - return !((ObjectFile *)sym.file)->exclude_libs; - return ctx.arg.export_dynamic; - case VER_NDX_LOCAL: - return false; - default: - return true; - } - }; - // Export symbols that are not hidden or marked as local. // We also want to mark imported symbols as such. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->get_global_syms()) { // If we are using a symbol in a DSO, we need to import it. if (sym->file && sym->file->is_dso) { - if (!sym->is_absolute()) { - std::scoped_lock lock(sym->mu); - sym->is_imported = true; - } + std::scoped_lock lock(sym->mu); + sym->is_imported = true; continue; } // If we have a definition of a symbol, we may want to export it. - if (sym->file == file && should_export(*sym)) { + if (sym->file == file && should_export(ctx, *sym)) { sym->is_exported = true; // Exported symbols are marked as imported as well by default // for DSOs. 
- if (ctx.arg.shared && - sym->visibility != STV_PROTECTED && - !ctx.arg.Bsymbolic && - !(ctx.arg.Bsymbolic_functions && sym->get_type() == STT_FUNC)) + if (ctx.arg.shared && !is_protected(ctx, *sym)) sym->is_imported = true; } } }); - // Apply --dynamic-list, --export-dynamic-symbol and // --export-dynamic-symbol-list options. // @@ -1843,7 +2027,7 @@ void compute_import_export(Context &ctx) { if (matcher.find(name)) { handle_match(sym); } else if (!cpp_matcher.empty()) { - if (std::optional s = cpp_demangle(name)) + if (std::optional s = demangle_cpp(name)) name = *s; if (cpp_matcher.find(name)) handle_match(sym); @@ -1907,14 +2091,14 @@ void compute_address_significance(Context &ctx) { }; // Some symbols' pointer values are leaked to the dynamic section. - mark(get_symbol(ctx, ctx.arg.entry)); - mark(get_symbol(ctx, ctx.arg.init)); - mark(get_symbol(ctx, ctx.arg.fini)); + mark(ctx.arg.entry); + mark(ctx.arg.init); + mark(ctx.arg.fini); // Exported symbols are conservatively considered address-taken. if (ctx.dynsym) for (Symbol *sym : ctx.dynsym->symbols) - if (sym->is_exported) + if (sym && sym->is_exported) mark(sym); // Handle data objects. 
@@ -1922,11 +2106,8 @@ void compute_address_significance(Context &ctx) { if (InputSection *sec = file->llvm_addrsig.get()) { u8 *p = (u8 *)sec->contents.data(); u8 *end = p + sec->contents.size(); - while (p != end) { - Symbol *sym = file->symbols[read_uleb(&p)]; - if (InputSection *isec = sym->get_input_section()) - isec->address_taken = true; - } + while (p != end) + mark(file->symbols[read_uleb(&p)]); } else { for (std::unique_ptr> &isec : file->sections) if (isec && !(isec->shdr().sh_flags & SHF_EXECINSTR)) @@ -1935,26 +2116,6 @@ void compute_address_significance(Context &ctx) { }); } -template -void clear_padding(Context &ctx) { - Timer t(ctx, "clear_padding"); - - auto zero = [&](Chunk *chunk, i64 next_start) { - i64 pos = chunk->shdr.sh_offset + chunk->shdr.sh_size; - memset(ctx.buf + pos, 0, next_start - pos); - }; - - std::vector *> chunks = ctx.chunks; - - std::erase_if(chunks, [](Chunk *chunk) { - return chunk->shdr.sh_type == SHT_NOBITS; - }); - - for (i64 i = 1; i < chunks.size(); i++) - zero(chunks[i - 1], chunks[i]->shdr.sh_offset); - zero(chunks.back(), ctx.output_file->filesize); -} - // We want to sort output chunks in the following order. // // @@ -1976,7 +2137,6 @@ void clear_padding(Context &ctx) { // // .got // .toc -// .alpha_got // // .relro_padding // @@ -2069,15 +2229,6 @@ void sort_output_sections_regular(Context &ctx) { return 2; if (chunk->name == ".toc") return 3; - if (chunk->name == ".alpha_got") - return 4; - - if (shdr.sh_flags & SHF_MERGE) { - if (shdr.sh_flags & SHF_STRINGS) - return (5LL << 32) | shdr.sh_entsize; - return (6LL << 32) | shdr.sh_entsize; - } - if (chunk == ctx.relro_padding) return INT64_MAX; return 0; @@ -2154,11 +2305,6 @@ void sort_output_sections(Context &ctx) { sort_output_sections_by_order(ctx); } -template -static bool is_tbss(Chunk *chunk) { - return (chunk->shdr.sh_type == SHT_NOBITS) && (chunk->shdr.sh_flags & SHF_TLS); -} - // This function assigns virtual addresses to output sections. 
Assigning // addresses is a bit tricky because we want to pack sections as tightly // as possible while not violating the constraints imposed by the hardware @@ -2224,6 +2370,10 @@ static void set_virtual_addresses_regular(Context &ctx) { return chunk == first_tls_chunk ? tls_alignment : (u64)chunk->shdr.sh_addralign; }; + auto is_tbss = [](Chunk *chunk) { + return (chunk->shdr.sh_type == SHT_NOBITS) && (chunk->shdr.sh_flags & SHF_TLS); + }; + for (i64 i = 0; i < chunks.size(); i++) { if (!(chunks[i]->shdr.sh_flags & SHF_ALLOC)) continue; @@ -2257,7 +2407,7 @@ static void set_virtual_addresses_regular(Context &ctx) { i64 flags1 = get_flags(chunks[i - 1]); i64 flags2 = get_flags(chunks[i]); - if (flags1 != flags2) { + if (!ctx.arg.nmagic && flags1 != flags2) { switch (ctx.arg.z_separate_code) { case SEPARATE_LOADABLE_SEGMENTS: addr = align_to(addr, ctx.page_size); @@ -2403,6 +2553,7 @@ static i64 set_file_offsets(Context &ctx) { } if (first.shdr.sh_type == SHT_NOBITS) { + first.shdr.sh_offset = fileoff; i++; continue; } @@ -2443,13 +2594,33 @@ static i64 set_file_offsets(Context &ctx) { while (i < chunks.size() && (chunks[i]->shdr.sh_flags & SHF_ALLOC) && - chunks[i]->shdr.sh_type == SHT_NOBITS) + chunks[i]->shdr.sh_type == SHT_NOBITS) { + chunks[i]->shdr.sh_offset = fileoff; i++; + } } return fileoff; } +// Remove debug sections from ctx.chunks and save them to ctx.debug_chunks. +// This is for --separate-debug-file. 
+template +void separate_debug_sections(Context &ctx) { + auto is_debug_section = [&](Chunk *chunk) { + if (chunk->shdr.sh_flags & SHF_ALLOC) + return false; + return chunk == ctx.gdb_index || chunk == ctx.symtab || chunk == ctx.strtab || + chunk->name.starts_with(".debug_"); + }; + + auto mid = std::stable_partition(ctx.chunks.begin(), ctx.chunks.end(), + is_debug_section); + + ctx.debug_chunks = {ctx.chunks.begin(), mid}; + ctx.chunks.erase(ctx.chunks.begin(), mid); +} + template void compute_section_headers(Context &ctx) { // Update sh_size for each chunk. @@ -2458,14 +2629,14 @@ void compute_section_headers(Context &ctx) { // Remove empty chunks. std::erase_if(ctx.chunks, [&](Chunk *chunk) { - return chunk->kind() != OUTPUT_SECTION && chunk != ctx.gdb_index && + return !chunk->to_osec() && chunk != ctx.gdb_index && chunk->shdr.sh_size == 0; }); // Set section indices. i64 shndx = 1; for (i64 i = 0; i < ctx.chunks.size(); i++) - if (ctx.chunks[i]->kind() != HEADER) + if (!ctx.chunks[i]->is_header()) ctx.chunks[i]->shndx = shndx++; if (ctx.symtab && SHN_LORESERVE <= shndx) { @@ -2551,7 +2722,7 @@ void fix_synthetic_symbols(Context &ctx) { std::vector *> sections; for (Chunk *chunk : ctx.chunks) - if (chunk->kind() != HEADER && (chunk->shdr.sh_flags & SHF_ALLOC)) + if (!chunk->is_header() && (chunk->shdr.sh_flags & SHF_ALLOC)) sections.push_back(chunk); auto find = [&](std::string name) -> Chunk * { @@ -2587,7 +2758,7 @@ void fix_synthetic_symbols(Context &ctx) { // If we set values to these symbols in a static PIE, glibc attempts // to run ifunc initializers twice, with the second attempt with wrong // function addresses, causing a segmentation fault. 
- if (ctx.reldyn && ctx.arg.is_static && !ctx.arg.pie) { + if (ctx.reldyn && ctx.arg.static_ && !ctx.arg.pie) { stop(ctx.__rel_iplt_start, ctx.reldyn); stop(ctx.__rel_iplt_end, ctx.reldyn); @@ -2688,6 +2859,19 @@ void fix_synthetic_symbols(Context &ctx) { } } + // PPC64's _{save,rest}gpr{0,1}_{14,15,16,...,31} symbols + if constexpr (is_ppc64v2) { + i64 offset = 0; + for (std::pair p : ppc64_save_restore_insns) { + std::string_view label = p.first; + if (!label.empty()) + if (Symbol *sym = get_symbol(ctx, label); + sym->file == ctx.internal_obj) + start(sym, ctx.extra.save_restore, offset); + offset += 4; + } + } + // __start_ and __stop_ symbols for (Chunk *chunk : sections) { if (std::optional name = get_start_stop_name(ctx, *chunk)) { @@ -2724,7 +2908,6 @@ void fix_synthetic_symbols(Context &ctx) { } } - // --section-order symbols for (SectionOrder &ord : ctx.arg.section_order) if (ord.type == SectionOrder::SYMBOL) @@ -2732,7 +2915,7 @@ void fix_synthetic_symbols(Context &ctx) { } template -i64 compress_debug_sections(Context &ctx) { +void compress_debug_sections(Context &ctx) { Timer t(ctx, "compress_debug_sections"); tbb::parallel_for((i64)0, (i64)ctx.chunks.size(), [&](i64 i) { @@ -2754,8 +2937,179 @@ i64 compress_debug_sections(Context &ctx) { ctx.ehdr->update_shdr(ctx); if (ctx.shdr) ctx.shdr->update_shdr(ctx); +} + +// BLAKE3 is a cryptographic hash function just like SHA256. +// We use it instead of SHA256 because it's faster. 
+static void blake3_hash(u8 *buf, i64 size, u8 *out) { + blake3_hasher hasher; + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, buf, size); + blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN); +} + +template +std::vector> get_shards(Context &ctx) { + constexpr i64 shard_size = 4 * 1024 * 1024; // 4 MiB + std::span buf = {ctx.buf, (size_t)ctx.output_file->filesize}; + std::vector> vec; + + while (!buf.empty()) { + i64 sz = std::min(shard_size, buf.size()); + vec.push_back(buf.subspan(0, sz)); + buf = buf.subspan(sz); + } + return vec; +} + +template +void write_build_id(Context &ctx) { + Timer t(ctx, "write_build_id"); + + switch (ctx.arg.build_id.kind) { + case BuildId::HEX: + ctx.buildid->contents = ctx.arg.build_id.value; + break; + case BuildId::HASH: { + std::vector> shards = get_shards(ctx); + std::vector hashes(shards.size() * BLAKE3_OUT_LEN); + + tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) { + blake3_hash(shards[i].data(), shards[i].size(), + hashes.data() + i * BLAKE3_OUT_LEN); + +#ifdef HAVE_MADVISE + // Make the kernel page out the file contents we've just written + // so that subsequent close(2) call will become quicker. + if (i > 0 && ctx.output_file->is_mmapped) + madvise(begin, end - begin, MADV_DONTNEED); +#endif + }); + + u8 buf[BLAKE3_OUT_LEN]; + blake3_hash(hashes.data(), hashes.size(), buf); + + assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN); + ctx.buildid->contents = {buf, buf + ctx.arg.build_id.size()}; + break; + } + case BuildId::UUID: { + u8 buf[16]; + get_random_bytes(buf, 16); + + // Indicate that this is UUIDv4 as defined by RFC4122 + buf[6] = (buf[6] & 0b0000'1111) | 0b0100'0000; + buf[8] = (buf[8] & 0b0011'1111) | 0b1000'0000; + ctx.buildid->contents = {buf, buf + 16}; + break; + } + default: + unreachable(); + } + + ctx.buildid->copy_buf(ctx); +} + +// A .gnu_debuglink section contains a filename and a CRC32 checksum of a +// debug info file. 
When we are writing a .gnu_debuglink, we don't know +// its CRC32 checksum because we haven't created a debug info file. So we +// write a dummy value instead. +// +// We can't choose a random value as a dummy value for build +// reproducibility. We also don't want to write a fixed value for all +// files because the CRC checksum is in this section to prevent using +// wrong file on debugging. gdb rejects a debug info file if its CRC +// doesn't match with the one in .gdb_debuglink. +// +// Therefore, we'll try to make our CRC checksum as unique as possible. +// We'll remember that checksum, and after creating a debug info file, add +// a few bytes of garbage at the end of it so that the debug info file's +// CRC checksum becomes the one that we have precomputed. +template +void write_gnu_debuglink(Context &ctx) { + Timer t(ctx, "write_gnu_debuglink"); + u32 crc32; + + if (ctx.buildid) { + crc32 = compute_crc32(0, ctx.buildid->contents.data(), + ctx.buildid->contents.size()); + } else { + std::vector> shards = get_shards(ctx); + std::vector> hashes(shards.size()); + + tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) { + hashes[i] = hash_string({(char *)shards[i].data(), shards[i].size()}); + }); + crc32 = compute_crc32(0, (u8 *)hashes.data(), hashes.size() * 8); + } + + ctx.gnu_debuglink->crc32 = crc32; + ctx.gnu_debuglink->copy_buf(ctx); +} + +// Write a separate debug file. This function is called after we finish +// writing to the usual output file. +template +void write_separate_debug_file(Context &ctx) { + Timer t(ctx, "write_separate_debug_file"); + + // Open an output file early + LockingOutputFile *file = + new LockingOutputFile(ctx, ctx.arg.separate_debug_file, 0666); + + // We want to write to the debug info file in background so that the + // user doesn't have to wait for it to complete. 
+ if (ctx.arg.detach) + notify_parent(); + + // A debug info file contains all sections as the original file, though + // most of them can be empty as if they were bss sections. We convert + // real sections into dummy sections here. + for (i64 i = 0; i < ctx.chunks.size(); i++) { + Chunk *chunk = ctx.chunks[i]; + if (chunk != ctx.ehdr && chunk != ctx.shdr && chunk != ctx.shstrtab && + chunk->shdr.sh_type != SHT_NOTE) { + Chunk *sec = new OutputSection(chunk->name, SHT_NULL); + sec->shdr = chunk->shdr; + sec->shdr.sh_type = SHT_NOBITS; + + ctx.chunks[i] = sec; + ctx.chunk_pool.emplace_back(sec); + } + } + + // Restore debug info sections that had been set aside while we were + // creating the main file. + tbb::parallel_for_each(ctx.debug_chunks, [&](Chunk *chunk) { + chunk->compute_section_size(ctx); + }); + + append(ctx.chunks, ctx.debug_chunks); + + // Write to the debug info file as if it were a regular output file. + compute_section_headers(ctx); + file->resize(ctx, set_osec_offsets(ctx)); + + ctx.output_file.reset(file); + ctx.buf = ctx.output_file->buf; + + copy_chunks(ctx); + + if (ctx.gdb_index) + write_gdb_index(ctx); + + // Reverse-compute a CRC32 value so that the CRC32 checksum embedded to + // the .gnu_debuglink section in the main executable matches with the + // debug info file's CRC32 checksum. 
+ u32 crc = compute_crc32(0, ctx.buf, ctx.output_file->filesize); + + std::vector &buf2 = ctx.output_file->buf2; + if (!buf2.empty()) + crc = compute_crc32(crc, buf2.data(), buf2.size()); - return set_osec_offsets(ctx); + std::vector trailer = crc32_solve(crc, ctx.gnu_debuglink->crc32); + append(ctx.output_file->buf2, trailer); + ctx.output_file->close(ctx); } // Write Makefile-style dependency rules to a file specified by @@ -2765,8 +3119,8 @@ void write_dependency_file(Context &ctx) { std::vector deps; std::unordered_set seen; - for (std::unique_ptr>> &mf : ctx.mf_pool) - if (!mf->parent) + for (std::unique_ptr &mf : ctx.mf_pool) + if (mf->is_dependency && !mf->parent) if (std::string path = path_clean(mf->name); seen.insert(path).second) deps.push_back(path); @@ -2829,7 +3183,7 @@ void show_stats(Context &ctx) { } static Counter num_bytes("total_input_bytes"); - for (std::unique_ptr>> &mf : ctx.mf_pool) + for (std::unique_ptr &mf : ctx.mf_pool) num_bytes += mf->size; static Counter num_input_sections("input_sections"); @@ -2844,7 +3198,7 @@ void show_stats(Context &ctx) { static Counter thunk_bytes("thunk_bytes"); for (Chunk *chunk : ctx.chunks) if (OutputSection *osec = chunk->to_osec()) - for (std::unique_ptr> &thunk : osec->thunks) + for (std::unique_ptr> &thunk : osec->thunks) thunk_bytes += thunk->size(); } @@ -2861,16 +3215,18 @@ template void create_internal_file(Context &); template void apply_exclude_libs(Context &); template void create_synthetic_sections(Context &); template void resolve_symbols(Context &); -template void kill_eh_frame_sections(Context &); -template void resolve_section_pieces(Context &); +template void do_lto(Context &); +template void parse_eh_frame_sections(Context &); +template void create_merged_sections(Context &); template void convert_common_symbols(Context &); -template void compute_merged_section_sizes(Context &); template void create_output_sections(Context &); template void add_synthetic_symbols(Context &); template void 
check_cet_errors(Context &); +template void apply_section_align(Context &); template void print_dependencies(Context &); template void write_repro_file(Context &); template void check_duplicate_symbols(Context &); +template void check_shlib_undefined(Context &); template void check_symbol_types(Context &); template void sort_init_fini(Context &); template void sort_ctor_dtor(Context &); @@ -2879,22 +3235,27 @@ template void shuffle_sections(Context &); template void compute_section_sizes(Context &); template void sort_output_sections(Context &); template void claim_unresolved_symbols(Context &); +template void compute_imported_symbol_weakness(Context &); template void scan_relocations(Context &); template void report_undef_errors(Context &); template void create_reloc_sections(Context &); template void copy_chunks(Context &); template void construct_relr(Context &); +template void sort_dynsyms(Context &); template void create_output_symtab(Context &); template void apply_version_script(Context &); template void parse_symbol_version(Context &); template void compute_import_export(Context &); template void compute_address_significance(Context &); -template void clear_padding(Context &); +template void separate_debug_sections(Context &); template void compute_section_headers(Context &); template i64 set_osec_offsets(Context &); template void fix_synthetic_symbols(Context &); -template i64 compress_debug_sections(Context &); +template void compress_debug_sections(Context &); +template void write_build_id(Context &); +template void write_gnu_debuglink(Context &); +template void write_separate_debug_file(Context &); template void write_dependency_file(Context &); template void show_stats(Context &); -} // namespace mold::elf +} // namespace mold diff --git a/elf/relocatable.cc b/src/relocatable.cc similarity index 96% rename from elf/relocatable.cc rename to src/relocatable.cc index 01bf6d39..639dc6ae 100644 --- a/elf/relocatable.cc +++ b/src/relocatable.cc @@ -35,7 
+35,7 @@ #include #include -namespace mold::elf { +namespace mold { // Create linker-synthesized sections template @@ -148,8 +148,6 @@ static u64 r_set_osec_offsets(Context &ctx) { template void combine_objects(Context &ctx) { - compute_merged_section_sizes(ctx); - create_output_sections(ctx); r_create_synthetic_sections(ctx); @@ -171,12 +169,10 @@ void combine_objects(Context &ctx) { compute_section_headers(ctx); i64 filesize = r_set_osec_offsets(ctx); - ctx.output_file = - OutputFile>::open(ctx, ctx.arg.output, filesize, 0666); + ctx.output_file = OutputFile::open(ctx, ctx.arg.output, filesize, 0666); ctx.buf = ctx.output_file->buf; copy_chunks(ctx); - clear_padding(ctx); ctx.output_file->close(ctx); ctx.checkpoint(); @@ -197,4 +193,4 @@ using E = MOLD_TARGET; template void combine_objects(Context &); -} // namespace mold::elf +} // namespace mold diff --git a/src/shrink-sections.cc b/src/shrink-sections.cc new file mode 100644 index 00000000..cfd3f4b3 --- /dev/null +++ b/src/shrink-sections.cc @@ -0,0 +1,151 @@ +// Since RISC instructions are generally up to 32 bits long, there's no +// way to embed very large immediates into their branch instructions. For +// example, RISC-V's JAL (jump and link) instruction can jump to only +// within PC ± 1 MiB because its immediate is 21 bits long. If the +// destination is further than that, we need to use two instructions +// instead; the first instruction being AUIPC, which sets the upper 20 +// bits of a displacement to a register, and the second being JALR, which +// specifies the lower 12 bits and the register. Combined, they specify a +// 32-bit displacement, which is sufficient to support the medium code +// model. +// +// However, always using two or more instructions for function calls is a +// waste of time and space if the branch target is within a single +// instruction's reach. There are two approaches to address this problem +// as follows: +// +// 1. 
The compiler optimistically emits a single branch instruction for +// all function calls. The linker then checks if the branch target is +// reachable, and if not, redirects the branch to a linker-synthesized +// code sequence that uses two or more instructions to branch further. +// That linker-synthesized code is called a "thunk". All RISC psABIs +// except RISC-V and LoongArch take this approach. +// +// 2. The compiler pessimistically emits two instructions to branch +// anywhere in PC ± 2 GiB, and the linker rewrites them with a single +// instruction if the branch target is close enough. RISC-V and +// LoongArch take this approach. +// +// This file contains functions to support (2). For (1), see thunks.cc. +// +// With the presence of this code-shrinking relaxation, sections can no +// longer be considered as atomic units. If we delete an instruction from +// the middle of a section, the section contents after that point need to +// be shifted by the size of the instruction. Symbol values and relocation +// offsets have to be shifted too if they refer to bytes past the deleted +// ones. +// +// In mold, we use `r_deltas` to memorize how many bytes have been shifted +// for relocations. For symbols, we directly mutate their `value` member. +// +// RISC-V and LoongArch object files tend to have way more relocations +// than those for other targets. This is because all branches, including +// those that jump within the same section, are explicitly expressed with +// relocations. Here is why we need them: all control-flow statements, +// such as `if` or `for`, are implemented using branch instructions. For +// other targets, the compiler doesn't emit relocations for such branches +// because it knows at compile-time exactly how many bytes have to be +// skipped. That's not true in RISC-V and LoongArch because the linker may +// delete bytes between a branch and its target. 
Therefore, all branches, +// including in-section ones, have to be explicitly expressed with +// relocations. +// +// Note that this mechanism only shrinks sections and never enlarges them, +// as the compiler always emits the longest instruction sequence. This +// makes the linker implementation a bit simpler because we don't need to +// worry about oscillation. + +#if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE || \ + MOLD_LOONGARCH64 || MOLD_LOONGARCH32 + +#include "mold.h" + +#include + +namespace mold { + +using E = MOLD_TARGET; + +static bool is_resizable(InputSection *isec) { + return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) && + (isec->shdr().sh_flags & SHF_EXECINSTR); +} + +template <> +void shrink_sections(Context &ctx) { + Timer t(ctx, "shrink_sections"); + + // True if we can use the 2-byte instructions. This is usually true on + // Unix because RV64GC is generally considered the baseline hardware. + bool use_rvc = false; + if constexpr (is_riscv) + use_rvc = get_eflags(ctx) & EF_RISCV_RVC; + + // Find all relaxable relocations and record how many bytes we can save + // into r_deltas. + // + // Technically speaking, relaxing relocations may allow more relocations + // to be relaxed because the distance between a branch instruction and + // its target may decrease as a result of relaxation. That said, the + // number of such relocations is negligible (I tried to self-host mold + // on RISC-V as an experiment and found that the mold-built .text is + // only ~0.04% larger than that of GNU ld), so we don't bother to handle + // them. We scan relocations only once here. + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + for (std::unique_ptr> &isec : file->sections) + if (is_resizable(isec.get())) + shrink_section(ctx, *isec, use_rvc); + }); + + // Fix symbol values. 
+ tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + for (Symbol *sym : file->symbols) { + if (sym->file != file) + continue; + + InputSection *isec = sym->get_input_section(); + if (!isec || isec->extra.r_deltas.empty()) + continue; + + std::span> rels = isec->get_rels(ctx); + auto it = std::lower_bound(rels.begin(), rels.end(), sym->value, + [&](const ElfRel &r, u64 val) { + return r.r_offset < val; + }); + + sym->value -= isec->extra.r_deltas[it - rels.begin()]; + } + }); + + // Recompute sizes of executable sections + tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { + if (chunk->to_osec() && (chunk->shdr.sh_flags & SHF_EXECINSTR)) + chunk->compute_section_size(ctx); + }); +} + +// Returns the distance between a relocated place and a symbol. +template <> +i64 compute_distance(Context &ctx, Symbol &sym, + InputSection &isec, const ElfRel &rel) { + // We handle absolute symbols as if they were infinitely far away + // because `shrink_section` may increase a distance between a branch + // instruction and an absolute symbol. Branching to an absolute + // location is extremely rare in real code, though. + if (sym.is_absolute()) + return INT64_MAX; + + // Likewise, relocations against weak undefined symbols won't be relaxed. + if (sym.esym().is_undef_weak()) + return INT64_MAX; + + // Compute a distance between the relocated place and the symbol. 
+ i64 S = sym.get_addr(ctx); + i64 A = rel.r_addend; + i64 P = isec.get_addr() + rel.r_offset; + return S + A - P; +} + +} // namespace mold + +#endif diff --git a/elf/subprocess.cc b/src/subprocess-unix.cc similarity index 88% rename from elf/subprocess.cc rename to src/subprocess-unix.cc index 51be8972..44e5e65a 100644 --- a/elf/subprocess.cc +++ b/src/subprocess-unix.cc @@ -1,5 +1,3 @@ -#if !defined(_WIN32) && !defined(__APPLE__) - #include "mold.h" #include "config.h" @@ -11,13 +9,15 @@ #include #include -namespace mold::elf { +namespace mold { #ifdef MOLD_X86_64 +static int pipe_write_fd = -1; + // Exiting from a program with large memory usage is slow -- // it may take a few hundred milliseconds. To hide the latency, // we fork a child and let it do the actual linking work. -std::function fork_child() { +void fork_child() { int pipefd[2]; if (pipe(pipefd) == -1) { perror("pipe"); @@ -50,12 +50,17 @@ std::function fork_child() { // Child close(pipefd[0]); + pipe_write_fd = pipefd[1]; +} + +void notify_parent() { + if (pipe_write_fd == -1) + return; - return [=] { - char buf[] = {1}; - [[maybe_unused]] int n = write(pipefd[1], buf, 1); - assert(n == 1); - }; + char buf[] = {1}; + [[maybe_unused]] int n = write(pipe_write_fd, buf, 1); + assert(n == 1); + pipe_write_fd = -1; } #endif @@ -84,6 +89,9 @@ static std::string find_dso(Context &ctx, std::filesystem::path self) { template [[noreturn]] void process_run_subcommand(Context &ctx, int argc, char **argv) { +#ifdef __APPLE__ + Fatal(ctx) << "-run is not supported on macOS"; +#else assert(argv[1] == "-run"s || argv[1] == "--run"s); if (!argv[2]) @@ -111,12 +119,11 @@ void process_run_subcommand(Context &ctx, int argc, char **argv) { // Execute a given command execvp(argv[2], argv + 2); Fatal(ctx) << "mold -run failed: " << argv[2] << ": " << errno_string(); +#endif } using E = MOLD_TARGET; template void process_run_subcommand(Context &, int, char **); -} // namespace mold::elf - -#endif +} // namespace mold diff 
--git a/src/subprocess-win32.cc b/src/subprocess-win32.cc new file mode 100644 index 00000000..fb336827 --- /dev/null +++ b/src/subprocess-win32.cc @@ -0,0 +1,20 @@ +#include "mold.h" + +namespace mold { + +#ifdef MOLD_X86_64 +void fork_child() {} +void notify_parent() {} +#endif + +template +[[noreturn]] +void process_run_subcommand(Context &ctx, int argc, char **argv) { + Fatal(ctx) << "-run is supported only on Unix"; +} + +using E = MOLD_TARGET; + +template void process_run_subcommand(Context &, int, char **); + +} // namespace mold diff --git a/elf/thunks.cc b/src/thunks.cc similarity index 83% rename from elf/thunks.cc rename to src/thunks.cc index 385e23db..c5a99fbc 100644 --- a/elf/thunks.cc +++ b/src/thunks.cc @@ -20,15 +20,14 @@ // we don't need to try too hard to reduce thunk size to the absolute // minimum. -#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2 || \ - MOLD_LOONGARCH64 || MOLD_LOONGARCH32 +#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2 #include "mold.h" #include #include -namespace mold::elf { +namespace mold { using E = MOLD_TARGET; @@ -39,9 +38,7 @@ static consteval i64 max_distance() { // and therefore the least two bits are always zero. So the branch // operand is effectively 28 bits long. That means the branch range is // [-2^27, 2^27) or PC ± 128 MiB. - // - // LoongArch's BR instruction also takes a 26 bit immediate. - if (is_arm64 || is_loongarch) + if (is_arm64) return 1 << 27; // ARM32's Thumb branch has 24 bits immediate, and the instructions are @@ -63,10 +60,10 @@ static consteval i64 max_distance() { // ARM64/ARM32/PPC, respectively. static constexpr i64 batch_size = max_distance() / 10; -// We assume that a single thunk group is smaller than 100 KiB. -static constexpr i64 max_thunk_size = 102400; +// We assume that a single thunk group is smaller than 900 KiB. 
+static constexpr i64 max_thunk_size = 900 * 1024; -static_assert(max_thunk_size / E::thunk_size < INT16_MAX); +static_assert(max_thunk_size / E::thunk_size < ThunkRef::MAX_SYM_IDX); template static bool is_reachable(Context &ctx, InputSection &isec, @@ -119,7 +116,7 @@ static bool is_reachable(Context &ctx, InputSection &isec, return -max_distance() <= val && val < max_distance(); } -static void reset_thunk(RangeExtensionThunk &thunk) { +static void reset_thunk(Thunk &thunk) { for (Symbol *sym : thunk.symbols) { sym->extra.thunk_idx = -1; sym->extra.thunk_sym_idx = -1; @@ -129,10 +126,10 @@ static void reset_thunk(RangeExtensionThunk &thunk) { // Scan relocations to collect symbols that need thunks. static void scan_rels(Context &ctx, InputSection &isec, - RangeExtensionThunk &thunk, i64 thunk_idx) { + Thunk &thunk, i64 thunk_idx) { std::span> rels = isec.get_rels(ctx); - std::vector &range_extn = isec.extra.range_extn; - range_extn.resize(rels.size()); + std::vector &thunk_refs = isec.extra.thunk_refs; + thunk_refs.resize(rels.size()); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; @@ -151,15 +148,15 @@ static void scan_rels(Context &ctx, InputSection &isec, // This relocation needs a thunk. If the symbol is already in a // previous thunk, reuse it. if (sym.extra.thunk_idx != -1) { - range_extn[i].thunk_idx = sym.extra.thunk_idx; - range_extn[i].sym_idx = sym.extra.thunk_sym_idx; + thunk_refs[i].thunk_idx = sym.extra.thunk_idx; + thunk_refs[i].sym_idx = sym.extra.thunk_sym_idx; continue; } // Otherwise, add the symbol to the current thunk if it's not // added already. 
- range_extn[i].thunk_idx = thunk_idx; - range_extn[i].sym_idx = -1; + thunk_refs[i].thunk_idx = thunk_idx; + thunk_refs[i].sym_idx = -1; if (sym.flags.exchange(-1) == 0) { std::scoped_lock lock(thunk.mu); @@ -170,8 +167,6 @@ static void scan_rels(Context &ctx, InputSection &isec, template <> void OutputSection::create_range_extension_thunks(Context &ctx) { - using Thunk = RangeExtensionThunk; - std::span *> m = members; if (m.empty()) return; @@ -179,9 +174,9 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { // Initialize input sections with a dummy offset so that we can // distinguish sections that have got an address with the one who // haven't. - tbb::parallel_for_each(m, [](InputSection *isec) { + for (InputSection *isec : m) isec->offset = -1; - }); + thunks.clear(); // We create thunks from the beginning of the section to the end. // We manage progress using four offsets which increase monotonically. @@ -215,7 +210,7 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { // Move D foward as far as we can jump from B to a thunk at D. auto d_thunk_end = [&] { u64 d_end = align_to(offset, 1 << m[d]->p2align) + m[d]->sh_size; - return align_to(d_end, Thunk::alignment) + max_thunk_size; + return align_to(d_end, Thunk::alignment) + max_thunk_size; }; while (d < m.size() && @@ -243,17 +238,15 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { reset_thunk(*thunks[t++]); // Create a new thunk and place it at D. - offset = align_to(offset, Thunk::alignment); + offset = align_to(offset, Thunk::alignment); i64 thunk_idx = thunks.size(); - Thunk *thunk = new Thunk(*this, offset); + Thunk *thunk = new Thunk(*this, offset); thunks.emplace_back(thunk); // Scan relocations between B and C to collect symbols that need // entries in the new thunk. 
- tbb::parallel_for_each(m.begin() + b, m.begin() + c, - [&](InputSection *isec) { - scan_rels(ctx, *isec, *thunk, thunk_idx); - }); + for (i64 i = b; i < c; i++) + scan_rels(ctx, *m[i], *thunk, thunk_idx); // Now that we know the number of symbols in the thunk, we can compute // the thunk's size. @@ -267,23 +260,21 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { }); // Assign offsets within the thunk to the symbols. - for (i64 i = 0; i < thunk->symbols.size(); i++) { - Symbol &sym = *thunk->symbols[i]; - sym.extra.thunk_idx = thunk_idx; - sym.extra.thunk_sym_idx = i; + for (i64 i = 0; Symbol *sym : thunk->symbols) { + sym->extra.thunk_idx = thunk_idx; + sym->extra.thunk_sym_idx = i++; } // Scan relocations again to fix symbol offsets in the last thunk. - tbb::parallel_for_each(m.begin() + b, m.begin() + c, - [&](InputSection *isec) { - std::span *> syms = isec->file.symbols; - std::span> rels = isec->get_rels(ctx); - std::span range_extn = isec->extra.range_extn; - - for (i64 i = 0; i < rels.size(); i++) - if (range_extn[i].thunk_idx == thunk_idx) - range_extn[i].sym_idx = syms[rels[i].r_sym]->extra.thunk_sym_idx; - }); + for (i64 i = b; i < c; i++) { + std::span *> syms = m[i]->file.symbols; + std::span> rels = m[i]->get_rels(ctx); + std::span thunk_refs = m[i]->extra.thunk_refs; + + for (i64 j = 0; j < rels.size(); j++) + if (thunk_refs[j].thunk_idx == thunk_idx) + thunk_refs[j].sym_idx = syms[rels[j].r_sym]->extra.thunk_sym_idx; + } // Move B forward to point to the begining of the next batch. 
b = c; @@ -299,6 +290,6 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { std::max(this->shdr.sh_addralign, 1 << isec->p2align); } -} // namespace mold::elf +} // namespace mold #endif diff --git a/elf/tls.cc b/src/tls.cc similarity index 88% rename from elf/tls.cc rename to src/tls.cc index 8d391ace..8d8476d2 100644 --- a/elf/tls.cc +++ b/src/tls.cc @@ -122,44 +122,26 @@ #include "mold.h" -namespace mold::elf { - -template -static ElfPhdr *get_tls_segment(Context &ctx) { - if (ctx.phdr) - for (ElfPhdr &phdr : ctx.phdr->phdrs) - if (phdr.p_type == PT_TLS) - return &phdr; - return nullptr; -} - -template -u64 get_tls_begin(Context &ctx) { - if (ElfPhdr *phdr = get_tls_segment(ctx)) - return phdr->p_vaddr; - return 0; -} +namespace mold { // Returns the TP address which can be used for efficient TLV accesses in // the main executable. TP at runtime refers to a per-process TLS block // whose address is not known at link-time. So the address returned from // this function is the TP if the TLS template image were a TLS block. template -u64 get_tp_addr(Context &ctx) { - ElfPhdr *phdr = get_tls_segment(ctx); - if (!phdr) - return 0; +u64 get_tp_addr(const ElfPhdr &phdr) { + assert(phdr.p_type == PT_TLS); if constexpr (is_x86 || is_sparc || is_s390x) { // On x86, SPARC and s390x, TP (%gs on i386, %fs on x86-64, %g7 on SPARC // and %a0/%a1 on s390x) refers to past the end of the TLS block for // historical reasons. TLVs are accessed with negative offsets from TP. - return align_to(phdr->p_vaddr + phdr->p_memsz, phdr->p_align); - } else if constexpr (is_arm || is_sh4 || is_alpha) { - // On ARM, SH4 and Alpha, the runtime appends two words at the beginning + return align_to(phdr.p_vaddr + phdr.p_memsz, phdr.p_align); + } else if constexpr (is_arm || is_sh4) { + // On ARM and SH4, the runtime appends two words at the beginning // of TLV template image when copying TLVs to the TLS block, so we need // to offset it. 
- return align_down(phdr->p_vaddr - sizeof(Word) * 2, phdr->p_align); + return align_down(phdr.p_vaddr - sizeof(Word) * 2, phdr.p_align); } else if constexpr (is_ppc || is_m68k) { // On PowerPC and m68k, TP is 0x7000 (28 KiB) past the beginning // of the TLV block to maximize the addressable range of load/store @@ -167,24 +149,22 @@ u64 get_tp_addr(Context &ctx) { // (32 KiB) off because there's a small implementation-defined piece of // data before the initial TLV block, and the runtime wants to access // them efficiently too. - return phdr->p_vaddr + 0x7000; + return phdr.p_vaddr + 0x7000; } else { // RISC-V and LoongArch just uses the beginning of the main executable's // TLV block as TP. Their load/store instructions usually take 12-bits // signed immediates, so the beginning of the TLS block ± 2 KiB is // accessible with a single load/store instruction. static_assert(is_riscv || is_loongarch); - return phdr->p_vaddr; + return phdr.p_vaddr; } } // Returns the address __tls_get_addr() would return if it's called // with offset 0. template -u64 get_dtp_addr(Context &ctx) { - ElfPhdr *phdr = get_tls_segment(ctx); - if (!phdr) - return 0; +u64 get_dtp_addr(const ElfPhdr &phdr) { + assert(phdr.p_type == PT_TLS); if constexpr (is_ppc || is_m68k) { // On PowerPC and m68k, R_DTPOFF is resolved to the address 0x8000 @@ -193,21 +173,20 @@ u64 get_dtp_addr(Context &ctx) { // immediates. That is, if the offset were right at the beginning of the // start of the TLS block, the half of addressible space (negative // immediates) would have been wasted. - return phdr->p_vaddr + 0x8000; + return phdr.p_vaddr + 0x8000; } else if constexpr (is_riscv) { // On RISC-V, the bias is 0x800 as the load/store instructions in the // ISA usually have a 12-bit immediate. - return phdr->p_vaddr + 0x800; + return phdr.p_vaddr + 0x800; } else { // On other targets, DTP simply refers to the beginning of the TLS block. 
- return phdr->p_vaddr; + return phdr.p_vaddr; } } using E = MOLD_TARGET; -template u64 get_tls_begin(Context &); -template u64 get_tp_addr(Context &); -template u64 get_dtp_addr(Context &); +template u64 get_tp_addr(const ElfPhdr &); +template u64 get_dtp_addr(const ElfPhdr &); -} // namespace mold::elf +} // namespace mold diff --git a/test/elf/CMakeLists.txt b/test/CMakeLists.txt similarity index 65% rename from test/elf/CMakeLists.txt rename to test/CMakeLists.txt index 1ba514bd..e64a1f0e 100644 --- a/test/elf/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,5 @@ option(MOLD_ENABLE_QEMU_TESTS "Enable tests on non-native targets" ON) -set(HOST ${CMAKE_HOST_SYSTEM_PROCESSOR}) +set(MACHINE ${CMAKE_HOST_SYSTEM_PROCESSOR}) if(EXISTS "/proc/cpuinfo") file(READ "/proc/cpuinfo" CPUINFO) @@ -13,25 +13,25 @@ execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine OUTPUT_VARIABLE ARCH ERROR_QUIET) if(NOT EXIT_CODE AND ARCH MATCHES "([^-]+).*") - set(HOST ${CMAKE_MATCH_1}) + set(MACHINE ${CMAKE_MATCH_1}) endif() -if(${HOST} MATCHES "amd64") - set(HOST x86_64) -elseif(${HOST} MATCHES "i386") - set(HOST i686) -elseif(${HOST} MATCHES "arm.*") - set(HOST arm) -elseif(${HOST} STREQUAL "powerpc64") - set(HOST ppc64) -elseif(${HOST} STREQUAL "powerpc64le") - set(HOST ppc64le) +if(${MACHINE} MATCHES "amd64") + set(MACHINE x86_64) +elseif(${MACHINE} MATCHES "i386") + set(MACHINE i686) +elseif(${MACHINE} MATCHES "arm.*") + set(MACHINE arm) +elseif(${MACHINE} STREQUAL "powerpc64") + set(MACHINE ppc64) +elseif(${MACHINE} STREQUAL "powerpc64le") + set(MACHINE ppc64le) endif() if(MOLD_ENABLE_QEMU_TESTS) list(APPEND QEMU_ARCHS x86_64 i386 arm aarch64 ppc ppc64 ppc64le sparc64 sh4 s390x - alpha riscv64 riscv32 m68k loongarch64) + riscv64 riscv32 m68k loongarch64) LIST(APPEND TRIPLES x86_64-linux-gnu @@ -45,7 +45,6 @@ if(MOLD_ENABLE_QEMU_TESTS) sparc64-linux-gnu s390x-linux-gnu sh4-linux-gnu - alpha-linux-gnu riscv32-linux-gnu m68k-linux-gnu loongarch64-linux-gnu) @@ -62,17 +61,17 @@ 
endif() function(add_target ARCH TRIPLE) set(CPU ${ARGV2}) - if(${ARCH} STREQUAL ${HOST}) + if(${ARCH} STREQUAL ${MACHINE}) set(IS_NATIVE 1) endif() file(GLOB ALL_TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS "*.sh") - list(FILTER ALL_TESTS EXCLUDE REGEX "_") + list(FILTER ALL_TESTS EXCLUDE REGEX "^arch-") file(GLOB TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS - "${ARCH}_*.sh") + "arch-${ARCH}-*.sh") list(APPEND TESTS ${ALL_TESTS}) @@ -88,7 +87,7 @@ function(add_target ARCH TRIPLE) WORKING_DIRECTORY ${mold_BINARY_DIR}) set_property(TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT - "HOST=${HOST};CPU=${CPU}") + "MACHINE=${MACHINE};CPU=${CPU}") if(IS_NATIVE) set_property(TEST ${TESTNAME} APPEND PROPERTY SKIP_REGULAR_EXPRESSION @@ -100,43 +99,43 @@ function(add_target ARCH TRIPLE) endforeach() endfunction() -if(${HOST} STREQUAL "x86_64" OR (HAS_qemu-x86_64 AND HAS_x86_64-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "x86_64" OR (HAS_qemu-x86_64 AND HAS_x86_64-linux-gnu-gcc)) add_target(x86_64 x86_64-linux-gnu) endif() -if(${HOST} STREQUAL "i686" OR (HAS_qemu-i386 AND HAS_i686-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "i686" OR (HAS_qemu-i386 AND HAS_i686-linux-gnu-gcc)) add_target(i686 i686-linux-gnu) endif() -if(${HOST} STREQUAL "aarch64" OR (HAS_qemu-aarch64 AND HAS_aarch64-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "aarch64" OR (HAS_qemu-aarch64 AND HAS_aarch64-linux-gnu-gcc)) add_target(aarch64 aarch64-linux-gnu) endif() -if(${HOST} STREQUAL "arm" OR (HAS_qemu-arm AND HAS_arm-linux-gnueabihf-gcc)) +if(${MACHINE} STREQUAL "arm" OR (HAS_qemu-arm AND HAS_arm-linux-gnueabihf-gcc)) add_target(arm arm-linux-gnueabihf) endif() -if(${HOST} STREQUAL "riscv64" OR (HAS_qemu-riscv64 AND HAS_riscv64-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "riscv64" OR (HAS_qemu-riscv64 AND HAS_riscv64-linux-gnu-gcc)) add_target(riscv64 riscv64-linux-gnu) endif() -if(${HOST} STREQUAL "riscv32" OR (HAS_qemu-riscv32 AND HAS_riscv32-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "riscv32" OR 
(HAS_qemu-riscv32 AND HAS_riscv32-linux-gnu-gcc)) add_target(riscv32 riscv32-linux-gnu) endif() -if(${HOST} STREQUAL "ppc" OR (HAS_qemu-ppc AND HAS_powerpc-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "ppc" OR (HAS_qemu-ppc AND HAS_powerpc-linux-gnu-gcc)) add_target(ppc powerpc-linux-gnu) endif() -if(${HOST} STREQUAL "ppc64" OR (HAS_qemu-ppc64 AND HAS_powerpc64-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "ppc64" OR (HAS_qemu-ppc64 AND HAS_powerpc64-linux-gnu-gcc)) add_target(ppc64 powerpc64-linux-gnu) endif() -if(${HOST} STREQUAL "ppc64le" OR (HAS_qemu-ppc64le AND HAS_powerpc64le-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "ppc64le" OR (HAS_qemu-ppc64le AND HAS_powerpc64le-linux-gnu-gcc)) add_target(ppc64le powerpc64le-linux-gnu) endif() -if(${HOST} STREQUAL "ppc64le" AND "${CPUINFO}" MATCHES "POWER10") +if(${MACHINE} STREQUAL "ppc64le" AND "${CPUINFO}" MATCHES "POWER10") add_target(ppc64le powerpc64le-linux-gnu power10) elseif(HAS_qemu-ppc64le AND HAS_powerpc64le-linux-gnu-gcc) file(WRITE "${CMAKE_BINARY_DIR}/empty.c" "") @@ -156,26 +155,22 @@ elseif(HAS_qemu-ppc64le AND HAS_powerpc64le-linux-gnu-gcc) endif() endif() -if(${HOST} STREQUAL "sparc64" OR (HAS_qemu-sparc64 AND HAS_sparc64-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "sparc64" OR (HAS_qemu-sparc64 AND HAS_sparc64-linux-gnu-gcc)) add_target(sparc64 sparc64-linux-gnu) endif() -if(${HOST} STREQUAL "s390x" OR (HAS_qemu-s390x AND HAS_s390x-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "s390x" OR (HAS_qemu-s390x AND HAS_s390x-linux-gnu-gcc)) add_target(s390x s390x-linux-gnu) endif() -if(${HOST} STREQUAL "sh4" OR (HAS_qemu-sh4 AND HAS_sh4-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "sh4" OR (HAS_qemu-sh4 AND HAS_sh4-linux-gnu-gcc)) add_target(sh4 sh4-linux-gnu) endif() -if(${HOST} STREQUAL "alpha" OR (HAS_qemu-alpha AND HAS_alpha-linux-gnu-gcc)) - add_target(alpha alpha-linux-gnu) -endif() - -if(${HOST} STREQUAL "m68k" OR (HAS_qemu-m68k AND HAS_m68k-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "m68k" OR (HAS_qemu-m68k AND 
HAS_m68k-linux-gnu-gcc)) add_target(m68k m68k-linux-gnu) endif() -if(${HOST} STREQUAL "loongarch64" OR (HAS_qemu-loongarch64 AND HAS_loongarch64-linux-gnu-gcc)) +if(${MACHINE} STREQUAL "loongarch64" OR (HAS_qemu-loongarch64 AND HAS_loongarch64-linux-gnu-gcc)) add_target(loongarch64 loongarch64-linux-gnu) endif() diff --git a/test/elf/abs-error.sh b/test/abs-error.sh similarity index 94% rename from test/elf/abs-error.sh rename to test/abs-error.sh index ca1cc1d7..65499c31 100755 --- a/test/elf/abs-error.sh +++ b/test/abs-error.sh @@ -5,7 +5,6 @@ [ $MACHINE = ppc64 ] && skip [ $MACHINE = ppc64le ] && skip [ $MACHINE = s390x ] && skip -[ $MACHINE = alpha ] && skip [[ $MACHINE = loongarch* ]] && skip cat < diff --git a/test/elf/aarch64_variant-pcs.sh b/test/arch-aarch64-variant-pcs.sh similarity index 93% rename from test/elf/aarch64_variant-pcs.sh rename to test/arch-aarch64-variant-pcs.sh index 6b5e4e8a..12361b79 100755 --- a/test/elf/aarch64_variant-pcs.sh +++ b/test/arch-aarch64-variant-pcs.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = aarch64 ] || skip - cat < /dev/null || skip .global foo .type foo, %function diff --git a/test/arch-arm-abs-error.sh b/test/arch-arm-abs-error.sh new file mode 100755 index 00000000..3a79c43c --- /dev/null +++ b/test/arch-arm-abs-error.sh @@ -0,0 +1,18 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < /dev/null || skip +#include +extern char foo; +int main() { printf("foo=%p\n", &foo); } +EOF + +$CC -o $t/exe -pie $t/a.o $t/b.o >& /dev/null && skip + +! $CC -B. 
-o $t/exe -pie $t/a.o $t/b.o >& $t/log +grep -q 'recompile with -fPIC' $t/log diff --git a/test/elf/arm_range-extension-thunk-disassembly.sh b/test/arch-arm-range-extension-thunk-disassembly.sh similarity index 94% rename from test/elf/arm_range-extension-thunk-disassembly.sh rename to test/arch-arm-range-extension-thunk-disassembly.sh index 9f3dcc59..b1f28d79 100755 --- a/test/elf/arm_range-extension-thunk-disassembly.sh +++ b/test/arch-arm-range-extension-thunk-disassembly.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = arm ] || skip - cat < diff --git a/test/elf/arm_range-extension-thunk.sh b/test/arch-arm-range-extension-thunk.sh similarity index 97% rename from test/elf/arm_range-extension-thunk.sh rename to test/arch-arm-range-extension-thunk.sh index ddf01313..191b1c51 100755 --- a/test/elf/arm_range-extension-thunk.sh +++ b/test/arch-arm-range-extension-thunk.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = arm ] || skip - echo 'int main() {}' | $CC -c -o /dev/null -xc - -O0 -mthumb >& /dev/null \ || skip diff --git a/test/elf/arm_thumb-interwork.sh b/test/arch-arm-thumb-interwork.sh similarity index 93% rename from test/elf/arm_thumb-interwork.sh rename to test/arch-arm-thumb-interwork.sh index baf8e04e..aed7b236 100755 --- a/test/elf/arm_thumb-interwork.sh +++ b/test/arch-arm-thumb-interwork.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[[ $MACHINE == arm* ]] || skip - echo 'int foo() { return 0; }' | $CC -o /dev/null -c -xc - -mthumb 2> /dev/null || skip cat <& /dev/null \ diff --git a/test/elf/i386_tls-module-base.sh b/test/arch-i686-tls-module-base.sh similarity index 97% rename from test/elf/i386_tls-module-base.sh rename to test/arch-i686-tls-module-base.sh index f0bc4fea..2e906ca7 100755 --- a/test/elf/i386_tls-module-base.sh +++ b/test/arch-i686-tls-module-base.sh @@ -1,8 +1,6 @@ #!/bin/bash . 
$(dirname $0)/common.inc -[ $MACHINE = i686 ] || skip - cat <<'EOF' | $CC -fPIC -o $t/a.o -c -xassembler - .globl get_foo .type get_foo, @function diff --git a/test/arch-i686-tlsdesc.sh b/test/arch-i686-tlsdesc.sh new file mode 100755 index 00000000..6363b380 --- /dev/null +++ b/test/arch-i686-tlsdesc.sh @@ -0,0 +1,48 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +supports_tlsdesc || skip + +cat <<'EOF' | $GCC -c -o $t/a.o -xassembler - +.globl get_foo +.type get_foo, @function +get_foo: + pushl %ebx + call __x86.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + subl $8, %esp + leal foo@TLSDESC(%ebx), %ebx + movl %ebx, %eax + call *foo@TLSCALL(%eax) + movl %gs:(%eax), %eax + addl $8, %esp + popl %ebx + ret +EOF + +cat < + +_Thread_local int foo; + +int get_foo(); + +int main() { + foo = 42; + printf("%d\n", get_foo()); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o +$QEMU $t/exe1 | grep -q 42 + +$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax +$QEMU $t/exe2 | grep -q 42 + +$CC -B. -shared -o $t/c.so $t/a.o +$CC -B. -o $t/exe3 $t/b.o $t/c.so +$QEMU $t/exe3 | grep -q 42 + +$CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax +$CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax +$QEMU $t/exe4 | grep -q 42 diff --git a/test/elf/loongarch64_mcmodel-extreme.sh b/test/arch-loongarch64-mcmodel-extreme.sh similarity index 100% rename from test/elf/loongarch64_mcmodel-extreme.sh rename to test/arch-loongarch64-mcmodel-extreme.sh diff --git a/test/arch-loongarch64-relax-call36.sh b/test/arch-loongarch64-relax-call36.sh new file mode 100755 index 00000000..34e40982 --- /dev/null +++ b/test/arch-loongarch64-relax-call36.sh @@ -0,0 +1,52 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat <<'EOF' | $CC -o $t/a.o -c -xassembler - +.globl foo, bar +.space 0x100000 +foo: + move $s0, $ra + .reloc ., R_LARCH_CALL36, print + .reloc ., R_LARCH_RELAX + pcaddu18i $t0, 0 + jirl $ra, $t0, 0 + move $ra, $s0 + ret +bar: + .reloc ., R_LARCH_CALL36, print + .reloc ., R_LARCH_RELAX + pcaddu18i $t0, 0 + jirl $zero, $t0, 0 +.space 0x100000 +EOF + +cat < + +void foo(); +void bar(); + +void print() { + printf("foo"); +} + +int main() { + foo(); + bar(); + printf("\n"); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,--no-relax +$QEMU $t/exe1 | grep -q foofoo + +$OBJDUMP -d $t/exe1 > $t/exe1.objdump +grep -A2 ':' $t/exe1.objdump | grep -wq pcaddu18i +grep -A2 ':' $t/exe1.objdump | grep -wq pcaddu18i + +$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--relax +$QEMU $t/exe2 | grep -q foofoo + +$OBJDUMP -d $t/exe2 > $t/exe2.objdump +grep -A2 ':' $t/exe2.objdump | grep -wq bl +grep -A2 ':' $t/exe2.objdump | grep -wq b diff --git a/test/arch-loongarch64-relax-got-load.sh b/test/arch-loongarch64-relax-got-load.sh new file mode 100755 index 00000000..279fa8b5 --- /dev/null +++ b/test/arch-loongarch64-relax-got-load.sh @@ -0,0 +1,33 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +int get_foo(); +int main() { printf("%d\n", get_foo()); } +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -pie -Wl,--no-relax +$QEMU $t/exe1 | grep -q '^3$' +$OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fqw pcalau12i +$OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fqw ld.d + +$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax +$QEMU $t/exe2 | grep -q '^3$' +$OBJDUMP -d $t/exe2 | grep -A1 ':' | grep -Fqw pcaddi + +$CC -B. 
-o $t/exe3 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax \ + -Wl,-Ttext=0x1000000,-Tdata=0x2000000 + +$QEMU $t/exe3 | grep -q '^3$' +$OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fqw pcalau12i +$OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fqw addi.d diff --git a/test/arch-loongarch64-relax-pcala-addi.sh b/test/arch-loongarch64-relax-pcala-addi.sh new file mode 100755 index 00000000..fe26c73c --- /dev/null +++ b/test/arch-loongarch64-relax-pcala-addi.sh @@ -0,0 +1,58 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <<'EOF' | $CC -o $t/a.o -c -xassembler - +.globl get_sym1, get_sym2, get_sym3 +get_sym1: + la.pcrel $a0, sym1 + ret +get_sym2: + la.pcrel $a0, sym2 + ret +get_sym3: + la.pcrel $a0, sym3 + ret +EOF + +cat <<'EOF' | $CC -o $t/b.o -c -xassembler - +.globl sym1, sym2, sym3 +sym1: + li.d $a0, 1 + ret +.space 1024 * 1024 +sym2: + li.d $a0, 2 + ret +.space 1024 * 1024 +sym3: + li.d $a0, 3 + ret +EOF + +cat < + +int (*get_sym1())(); +int (*get_sym2())(); +int (*get_sym3())(); + +int main() { + printf("%d %d %d\n", get_sym1()(), get_sym2()(), get_sym3()()); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax +$QEMU $t/exe1 | grep -q '^1 2 3$' + +$OBJDUMP -d $t/exe1 > $t/exe1.objdump +grep -A1 ':' $t/exe1.objdump | grep -q pcalau12i +grep -A1 ':' $t/exe1.objdump | grep -q pcalau12i +grep -A1 ':' $t/exe1.objdump | grep -q pcalau12i + +$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,--relax +$QEMU $t/exe2 | grep -q '^1 2 3$' + +$OBJDUMP -d $t/exe2 > $t/exe2.objdump +grep -A1 ':' $t/exe2.objdump | grep -q pcaddi +grep -A1 ':' $t/exe2.objdump | grep -q pcaddi +grep -A1 ':' $t/exe2.objdump | grep -q pcalau12i diff --git a/test/arch-loongarch64-relax-tlsdesc.sh b/test/arch-loongarch64-relax-tlsdesc.sh new file mode 100755 index 00000000..37b44715 --- /dev/null +++ b/test/arch-loongarch64-relax-tlsdesc.sh @@ -0,0 +1,43 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat <<'EOF' | $CC -o $t/a.o -c -xc - -fPIC +_Thread_local char foo[4] = "foo"; +_Thread_local char padding[100000] = "padding"; +EOF + +cat <<'EOF' | $CC -o $t/b.o -c -xc - -fPIC +_Thread_local char bar[4] = "bar"; +EOF + +cat <<'EOF' | $CC -o $t/c.o -c -xc - -fPIC -mtls-dialect=desc -O2 +extern _Thread_local char foo[4]; +extern _Thread_local char bar[4]; + +char *get_foo() { return foo; } +char *get_bar() { return bar; } +EOF + +cat < +char *get_foo(); +char *get_bar(); + +int main() { + printf("%s %s\n", get_foo(), get_bar()); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--no-relax +$QEMU $t/exe1 | grep -q 'foo bar' + +$OBJDUMP -d $t/exe1 > $t/exe1.objdump +grep -A6 ':' $t/exe1.objdump | grep -Fq pcalau12i +grep -A6 ':' $t/exe1.objdump | grep -Fq pcalau12i + +$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--relax +$QEMU $t/exe2 | grep -q 'foo bar' + +$OBJDUMP -d $t/exe2 > $t/exe2.objdump +grep -A6 ':' $t/exe2.objdump | grep -Fq li.w +grep -A6 ':' $t/exe2.objdump | grep -Fq lu12i.w diff --git a/test/elf/package-metadata.sh b/test/arch-ppc64le-save-restore-gprs.sh similarity index 53% rename from test/elf/package-metadata.sh rename to test/arch-ppc64le-save-restore-gprs.sh index 4c673bc5..2a2fd101 100755 --- a/test/elf/package-metadata.sh +++ b/test/arch-ppc64le-save-restore-gprs.sh @@ -8,5 +8,5 @@ int main() { } EOF -$CC -B. -o $t/exe $t/a.o -Wl,-package-metadata='{"foo":"bar"}' -readelf -x .note.package $t/exe | grep -Fq '{"foo":"bar"}' +$CC -B. 
-o $t/exe $t/a.o +$OBJDUMP -d $t/exe | grep -q '<_savegpr0_14>' diff --git a/test/elf/riscv64_attributes.sh b/test/arch-riscv64-attributes.sh similarity index 100% rename from test/elf/riscv64_attributes.sh rename to test/arch-riscv64-attributes.sh diff --git a/test/elf/riscv64_attributes2.sh b/test/arch-riscv64-attributes2.sh similarity index 100% rename from test/elf/riscv64_attributes2.sh rename to test/arch-riscv64-attributes2.sh diff --git a/test/arch-riscv64-global-pointer-dso.sh b/test/arch-riscv64-global-pointer-dso.sh new file mode 100755 index 00000000..7f6fef37 --- /dev/null +++ b/test/arch-riscv64-global-pointer-dso.sh @@ -0,0 +1,27 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +int main() { + printf("Hello world\n"); +} +EOF + +$CC -B. -o $t/exe $t/b.so $t/c.o $t/d.o +$QEMU $t/exe | grep -q 'Hello world' diff --git a/test/arch-riscv64-global-pointer.sh b/test/arch-riscv64-global-pointer.sh new file mode 100755 index 00000000..b184e8d5 --- /dev/null +++ b/test/arch-riscv64-global-pointer.sh @@ -0,0 +1,26 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +int main() { + printf("Hello world\n"); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o -fno-PIE +readelf -W --dyn-syms $t/exe1 | grep -Fq '__global_pointer$' + +$CC -B. -o $t/exe2 $t/a.o -fPIE +readelf -W --dyn-syms $t/exe2 | grep -Fq '__global_pointer$' + +cat < +int hello() { + printf("Hello world\n"); +} +EOF + +$CC -B. -o $t/c.so $t/b.o -shared +readelf -W --dyn-syms $t/c.so > $t/log1 +! grep -Fq '__global_pointer$' $t/log1 || false diff --git a/test/elf/riscv64_norvc.sh b/test/arch-riscv64-norvc.sh similarity index 91% rename from test/elf/riscv64_norvc.sh rename to test/arch-riscv64-norvc.sh index c679f64b..1383d4e0 100755 --- a/test/elf/riscv64_norvc.sh +++ b/test/arch-riscv64-norvc.sh @@ -1,8 +1,6 @@ #!/bin/bash . 
$(dirname $0)/common.inc -[ $MACHINE = riscv64 -o $MACHINE = riscv32 ] || skip - # Disable C extension if [ $MACHINE = riscv32 ]; then ISA=rv32g diff --git a/test/elf/riscv64_obj-compatible.sh b/test/arch-riscv64-obj-compatible.sh similarity index 100% rename from test/elf/riscv64_obj-compatible.sh rename to test/arch-riscv64-obj-compatible.sh diff --git a/test/arch-riscv64-relax-got.sh b/test/arch-riscv64-relax-got.sh new file mode 100755 index 00000000..14e33355 --- /dev/null +++ b/test/arch-riscv64-relax-got.sh @@ -0,0 +1,79 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +int get_sym1(); +int get_sym2(); +int get_sym3(); +int get_sym4(); +int get_sym5(); + +int main() { + printf("%x %x %x %x %x\n", + get_sym1(), get_sym2(), get_sym3(), get_sym4(), get_sym5()); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax +$QEMU $t/exe1 | grep -Eq '^0 ba beef 11beef deadbeef$' + +$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o +$QEMU $t/exe2 | grep -Eq '^0 ba beef 11beef deadbeef$' + +$OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -Eq $'li[ \t]+a0,186$' diff --git a/test/elf/riscv64_relax-hi20.sh b/test/arch-riscv64-relax-hi20.sh similarity index 92% rename from test/elf/riscv64_relax-hi20.sh rename to test/arch-riscv64-relax-hi20.sh index 729ec063..fb4774eb 100755 --- a/test/elf/riscv64_relax-hi20.sh +++ b/test/arch-riscv64-relax-hi20.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = riscv64 -o $MACHINE = riscv32 ] || skip - cat < +void *foo(); +void bar(); +int main() { printf("%d %p %p\n", foo() == bar, foo(), bar); } +EOF + +cat <& /dev/null; then +if test_cxxflags -static; then $CXX -B. 
-o $t/exe $t/a.o -static -mcmodel=large $QEMU $t/exe fi diff --git a/test/elf/x86_64_execstack-if-needed.sh b/test/arch-x86_64-execstack-if-needed.sh similarity index 92% rename from test/elf/x86_64_execstack-if-needed.sh rename to test/arch-x86_64-execstack-if-needed.sh index 5af2b5fa..3fae6c76 100755 --- a/test/elf/x86_64_execstack-if-needed.sh +++ b/test/arch-x86_64-execstack-if-needed.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - cat < /dev/null || skip cat <& /dev/null || skip + +mkdir -p $t/foo + +echo 'char hello[] = "Hello world";' | $CC -shared -o $t/libbar.so -m32 -xc - +echo 'char hello[] = "Hello world";' | $CC -shared -o $t/foo/libbar.so -xc - + +cat < +extern char hello[]; +int main() { + printf("%s\n", hello); +} +EOF + +cat < $t/b.script +INPUT(libbar.so) +EOF + +cd $t + +$CC -B$OLDPWD -o exe1 -Lfoo a.o b.script +LD_LIBRARY_PATH=. $QEMU ./exe1 | grep -q 'Hello world' + +$CC -B$OLDPWD -o exe2 -Lfoo b.script a.o +LD_LIBRARY_PATH=. $QEMU ./exe2 | grep -q 'Hello world' diff --git a/test/arch-x86_64-incompatible-libs-linker-script2.sh b/test/arch-x86_64-incompatible-libs-linker-script2.sh new file mode 100755 index 00000000..3630692c --- /dev/null +++ b/test/arch-x86_64-incompatible-libs-linker-script2.sh @@ -0,0 +1,32 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +nm mold | grep -q '__tsan_init' && skip +echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null || skip + +mkdir -p $t/foo + +cat < +extern char hello[]; +int main() { + printf("%s\n", hello); +} +EOF + +cat < $t/d.script +INPUT(a.o) +EOF + +cd $t + +$OLDPWD/ld -o e.o -r -Lfoo d.script c.o +$OLDPWD/ld -o f.o -r -Lfoo c.o d.script diff --git a/test/elf/x86_64_incompatible-libs.sh b/test/arch-x86_64-incompatible-libs.sh similarity index 96% rename from test/elf/x86_64_incompatible-libs.sh rename to test/arch-x86_64-incompatible-libs.sh index c6224139..3661d82b 100755 --- a/test/elf/x86_64_incompatible-libs.sh +++ b/test/arch-x86_64-incompatible-libs.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null \ || skip diff --git a/test/elf/x86_64_incompatible-libs2.sh b/test/arch-x86_64-incompatible-libs2.sh similarity index 96% rename from test/elf/x86_64_incompatible-libs2.sh rename to test/arch-x86_64-incompatible-libs2.sh index 549d7f51..167045ea 100755 --- a/test/elf/x86_64_incompatible-libs2.sh +++ b/test/arch-x86_64-incompatible-libs2.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null \ || skip diff --git a/test/elf/x86_64_incompatible-obj.sh b/test/arch-x86_64-incompatible-obj.sh similarity index 91% rename from test/elf/x86_64_incompatible-obj.sh rename to test/arch-x86_64-incompatible-obj.sh index 472c0af3..9f073a97 100755 --- a/test/elf/x86_64_incompatible-obj.sh +++ b/test/arch-x86_64-incompatible-obj.sh @@ -1,8 +1,6 @@ #!/bin/bash . 
$(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null \ || skip diff --git a/test/elf/x86_64_init-array-readonly.sh b/test/arch-x86_64-init-array-readonly.sh similarity index 95% rename from test/elf/x86_64_init-array-readonly.sh rename to test/arch-x86_64-init-array-readonly.sh index f4cdbcd3..2e6c41f2 100755 --- a/test/elf/x86_64_init-array-readonly.sh +++ b/test/arch-x86_64-init-array-readonly.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - cat < char arr1[0xc0000000]; diff --git a/test/elf/x86_64_mergeable-records.sh b/test/arch-x86_64-mergeable-records.sh similarity index 93% rename from test/elf/x86_64_mergeable-records.sh rename to test/arch-x86_64-mergeable-records.sh index 4b82d9b1..62408984 100755 --- a/test/elf/x86_64_mergeable-records.sh +++ b/test/arch-x86_64-mergeable-records.sh @@ -1,9 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -# Skip if target is not x86-64 -[ $MACHINE = x86_64 ] || skip - test_cflags -static || skip cat <<'EOF' | $CC -o $t/a.o -c -x assembler - diff --git a/test/arch-x86_64-mergeable-strings-nonalloc.sh b/test/arch-x86_64-mergeable-strings-nonalloc.sh new file mode 100755 index 00000000..3d817d36 --- /dev/null +++ b/test/arch-x86_64-mergeable-strings-nonalloc.sh @@ -0,0 +1,23 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <<'EOF' | $CC -o $t/a.o -c -xc - +int main() {} +EOF + +cat <<'EOF' | $CC -o $t/b.o -c -x assembler - +.section .foo, "", @progbits +.quad .L1 - 1 +.quad .L2 - 1 + +.section .bar, "MS", @progbits, 1 +.L1: + .string "abc" +.L2: + .string "xyz" +EOF + +$CC -B. -o $t/exe $t/a.o $t/b.o + +readelf -x .foo $t/exe | grep -Fq '03000000 00000000 ffffffff ffffffff' +readelf -x .bar $t/exe | grep -Fq 'xyz.abc.' 
diff --git a/test/elf/x86_64_mergeable-strings.sh b/test/arch-x86_64-mergeable-strings.sh similarity index 89% rename from test/elf/x86_64_mergeable-strings.sh rename to test/arch-x86_64-mergeable-strings.sh index 9f21176d..5d6148c2 100755 --- a/test/elf/x86_64_mergeable-strings.sh +++ b/test/arch-x86_64-mergeable-strings.sh @@ -3,9 +3,6 @@ test_cflags -static || skip -# Skip if target is not x86-64 -[ $MACHINE = x86_64 ] || skip - cat <<'EOF' | $CC -o $t/a.o -c -x assembler - .text .globl main diff --git a/test/elf/x86_64_note-property.sh b/test/arch-x86_64-note-property.sh similarity index 89% rename from test/elf/x86_64_note-property.sh rename to test/arch-x86_64-note-property.sh index 9a326d91..55286b17 100755 --- a/test/elf/x86_64_note-property.sh +++ b/test/arch-x86_64-note-property.sh @@ -1,9 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -# Skip if target is not x86-64 -[ $MACHINE = x86_64 ] || skip - echo endbr64 | $CC -o /dev/null -c -xassembler - 2> /dev/null || skip $CC -fcf-protection=branch -c /dev/null -o /dev/null -xc 2> /dev/null || skip diff --git a/test/elf/x86_64_note-property2.sh b/test/arch-x86_64-note-property2.sh similarity index 97% rename from test/elf/x86_64_note-property2.sh rename to test/arch-x86_64-note-property2.sh index 1a240f87..903de331 100755 --- a/test/elf/x86_64_note-property2.sh +++ b/test/arch-x86_64-note-property2.sh @@ -4,9 +4,6 @@ # OneTBB isn't tsan-clean nm mold | grep -q '__tsan_init' && skip -# Skip if target is not x86-64 -[ $MACHINE = x86_64 ] || skip - # Binutils 2.32 injects their own .note.gnu.property section interfering with the tests test_cflags -Xassembler -mx86-used-note=no && CFLAGS="-Xassembler -mx86-used-note=no" || CFLAGS="" diff --git a/test/elf/x86_64_note.sh b/test/arch-x86_64-note.sh similarity index 91% rename from test/elf/x86_64_note.sh rename to test/arch-x86_64-note.sh index 7f951cdf..51aa68d4 100755 --- a/test/elf/x86_64_note.sh +++ b/test/arch-x86_64-note.sh @@ -3,8 +3,6 @@ test_cflags -static 
|| skip -[ $MACHINE = x86_64 ] || skip - # Binutils 2.32 injects their own .note.gnu.property section interfering with the tests test_cflags -Xassembler -mx86-used-note=no && CFLAGS="-Xassembler -mx86-used-note=no" || CFLAGS="" @@ -39,5 +37,5 @@ grep -Eq '.note.baz\s+NOTE.+000008 00 A 0 0 8' $t/log grep -Eq '.note.nonalloc\s+NOTE.+000008 00 0 0 1' $t/log readelf --segments $t/exe > $t/log -grep -Fq '01 .note.baz .note.foo .note.bar' $t/log +grep -Fq '01 .note.bar .note.baz .note.foo' $t/log ! grep -q 'NOTE.*0x0000000000000000 0x0000000000000000' $t/log || false diff --git a/test/elf/x86_64_note2.sh b/test/arch-x86_64-note2.sh similarity index 89% rename from test/elf/x86_64_note2.sh rename to test/arch-x86_64-note2.sh index feba6728..e2bb3036 100755 --- a/test/elf/x86_64_note2.sh +++ b/test/arch-x86_64-note2.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - # Binutils 2.32 injects their own .note.gnu.property section interfering with the tests test_cflags -Xassembler -mx86-used-note=no && CFLAGS="-Xassembler -mx86-used-note=no" || CFLAGS="" @@ -31,4 +29,4 @@ EOF ./mold -o $t/exe $t/a.o $t/b.o $t/c.o $t/d.o readelf --segments $t/exe > $t/log -grep -Fq '01 .note.a .note.c .note.b' $t/log +grep -Fq '01 .note.a .note.b .note.c' $t/log diff --git a/test/elf/x86_64_plt.sh b/test/arch-x86_64-plt.sh similarity index 93% rename from test/elf/x86_64_plt.sh rename to test/arch-x86_64-plt.sh index 32b61c4c..b8bddddf 100755 --- a/test/elf/x86_64_plt.sh +++ b/test/arch-x86_64-plt.sh @@ -1,8 +1,6 @@ #!/bin/bash . 
$(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - cat <<'EOF' | $CC -o $t/a.o -c -x assembler - .text .globl main diff --git a/test/elf/x86_64_preinit-array.sh b/test/arch-x86_64-preinit-array.sh similarity index 95% rename from test/elf/x86_64_preinit-array.sh rename to test/arch-x86_64-preinit-array.sh index 6b61bdb5..bb326e40 100755 --- a/test/elf/x86_64_preinit-array.sh +++ b/test/arch-x86_64-preinit-array.sh @@ -3,8 +3,6 @@ is_musl && skip -[ $MACHINE = x86_64 ] || skip - cat < /dev/null || skip cat < #include diff --git a/test/elf/x86_64_section-name.sh b/test/arch-x86_64-section-name.sh similarity index 98% rename from test/elf/x86_64_section-name.sh rename to test/arch-x86_64-section-name.sh index 31f9b480..8fa06e90 100755 --- a/test/elf/x86_64_section-name.sh +++ b/test/arch-x86_64-section-name.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - cat <<'EOF' | $CC -o $t/a.o -c -x assembler - .globl _start .text diff --git a/test/arch-x86_64-tbss-only.sh b/test/arch-x86_64-tbss-only.sh new file mode 100755 index 00000000..6ebdb453 --- /dev/null +++ b/test/arch-x86_64-tbss-only.sh @@ -0,0 +1,19 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +# Test if grep supports backreferences +echo abab | grep -Eq '(ab)\1' || skip + +cat < @@ -27,7 +25,6 @@ static _Thread_local int x5 = 5; int get_x5() { return x5; } EOF - cat < diff --git a/test/elf/x86_64_tls-module-base.sh b/test/arch-x86_64-tls-module-base.sh similarity index 96% rename from test/elf/x86_64_tls-module-base.sh rename to test/arch-x86_64-tls-module-base.sh index d4fec306..830f0297 100755 --- a/test/elf/x86_64_tls-module-base.sh +++ b/test/arch-x86_64-tls-module-base.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip +supports_tlsdesc || skip cat < + +_Thread_local int foo; + +int get_foo(); + +int main() { + foo = 42; + printf("%d\n", get_foo()); +} +EOF + +$CC -B. 
-o $t/exe1 $t/a.o $t/b.o +$QEMU $t/exe1 | grep -q 42 + +$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax +$QEMU $t/exe2 | grep -q 42 + +$CC -B. -shared -o $t/c.so $t/a.o +$CC -B. -o $t/exe3 $t/b.o $t/c.so +$QEMU $t/exe3 | grep -q 42 + +$CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax +$CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax +$QEMU $t/exe4 | grep -q 42 diff --git a/test/elf/x86_64_unique.sh b/test/arch-x86_64-unique.sh similarity index 94% rename from test/elf/x86_64_unique.sh rename to test/arch-x86_64-unique.sh index 0d967ce0..ecbcda1b 100755 --- a/test/elf/x86_64_unique.sh +++ b/test/arch-x86_64-unique.sh @@ -1,8 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip - cat <&1 | grep -q 'may cause a segmentation fault' +$GCC -B. -o $t/exe $t/a.o $t/b.o 2>&1 | grep -Eq 'may cause a segmentation fault|requires executable stack' diff --git a/test/elf/x86_64_warn-shared-textrel.sh b/test/arch-x86_64-warn-shared-textrel.sh similarity index 88% rename from test/elf/x86_64_warn-shared-textrel.sh rename to test/arch-x86_64-warn-shared-textrel.sh index 1f42a833..3c5a6da9 100755 --- a/test/elf/x86_64_warn-shared-textrel.sh +++ b/test/arch-x86_64-warn-shared-textrel.sh @@ -4,9 +4,6 @@ # Skip if libc is musl is_musl && skip -# Skip if target is not x86-64 -[ $MACHINE = x86_64 ] || skip - cat <<'EOF' | $CC -c -o $t/a.o -x assembler - .globl fn fn: diff --git a/test/elf/x86_64_warn-textrel.sh b/test/arch-x86_64-warn-textrel.sh similarity index 87% rename from test/elf/x86_64_warn-textrel.sh rename to test/arch-x86_64-warn-textrel.sh index bf53c91d..031cdcdf 100755 --- a/test/elf/x86_64_warn-textrel.sh +++ b/test/arch-x86_64-warn-textrel.sh @@ -4,9 +4,6 @@ # Skip if libc is musl is_musl && skip -# Skip if target is not x86-64 -[ $MACHINE = x86_64 ] || skip - cat <<'EOF' | $CC -c -o $t/a.o -x assembler - .globl fn fn: diff --git a/test/elf/x86_64_z-ibt.sh b/test/arch-x86_64-z-ibt.sh similarity index 92% rename from test/elf/x86_64_z-ibt.sh rename to 
test/arch-x86_64-z-ibt.sh index d1dbcd97..9bb5f066 100755 --- a/test/elf/x86_64_z-ibt.sh +++ b/test/arch-x86_64-z-ibt.sh @@ -1,7 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip echo endbr64 | $CC -o /dev/null -c -xassembler - 2> /dev/null || skip cat < void hello() { printf("Hello"); } diff --git a/test/elf/x86_64_endbr.sh b/test/arch-x86_64-z-rewrite-endbr.sh similarity index 95% rename from test/elf/x86_64_endbr.sh rename to test/arch-x86_64-z-rewrite-endbr.sh index 1efab0e6..0a04ffdc 100755 --- a/test/elf/x86_64_endbr.sh +++ b/test/arch-x86_64-z-rewrite-endbr.sh @@ -1,7 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip test_cflags -fcf-protection || skip cat < $t/log1 + +grep -A1 ':' $t/log1 | grep -q endbr64 +grep -A1 ':' $t/log1 | grep -q endbr64 +grep -A1 '
:' $t/log1 | grep -q endbr64 + +$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-z,rewrite-endbr +$OBJDUMP -dr $t/exe2 > $t/log2 + +grep -A1 ':' $t/log2 | grep -q nop +grep -A1 ':' $t/log2 | grep -q nop +grep -A1 '
:' $t/log2 | grep -q endbr64 diff --git a/test/arch-x86_64-z-rewrite-endbr3.sh b/test/arch-x86_64-z-rewrite-endbr3.sh new file mode 100755 index 00000000..f8358542 --- /dev/null +++ b/test/arch-x86_64-z-rewrite-endbr3.sh @@ -0,0 +1,19 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +test_cflags -fcf-protection || skip +[ "$QEMU" == '' ] || skip + +# Check if Intel SDE CPU emulator is available +command -v sde >& /dev/null || skip +sde --help | grep -q 'Software Development Emulator' || skip + +cat < +int main() { + printf("Hello world\n"); +} +EOF + +$CC -B. -o $t/exe $t/a.o -Wl,-z,rewrite-endbr +sde -cet 1 -- $t/exe | grep -q 'Hello world' diff --git a/test/elf/x86_64_z-shstk.sh b/test/arch-x86_64-z-shstk.sh similarity index 91% rename from test/elf/x86_64_z-shstk.sh rename to test/arch-x86_64-z-shstk.sh index 872f60a5..42c51439 100755 --- a/test/elf/x86_64_z-shstk.sh +++ b/test/arch-x86_64-z-shstk.sh @@ -1,7 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = x86_64 ] || skip echo endbr64 | $CC -o /dev/null -c -xassembler - 2> /dev/null || skip cat < $t/log2 grep -q libbar $t/log2 -! grep -q libfoo $t/log2 || false +grep -q libfoo $t/log2 diff --git a/test/elf/as-needed-dso2.sh b/test/as-needed-dso2.sh similarity index 100% rename from test/elf/as-needed-dso2.sh rename to test/as-needed-dso2.sh diff --git a/test/elf/as-needed-weak.sh b/test/as-needed-weak.sh similarity index 77% rename from test/elf/as-needed-weak.sh rename to test/as-needed-weak.sh index 112561fc..fc432300 100755 --- a/test/elf/as-needed-weak.sh +++ b/test/as-needed-weak.sh @@ -18,14 +18,14 @@ cat < $t/log1 grep -Fq 'Shared library: [libfoo.so]' $t/log1 grep -Fq 'Shared library: [libbar.so]' $t/log1 -$CC -o $t/exe2 $t/a.o -Wl,-as-needed -L$t -lbar -lfoo +$CC -B. -o $t/exe2 $t/a.o -Wl,-as-needed -L$t -lbar -lfoo readelf --dynamic $t/exe2 > $t/log2 -! grep -Fq 'Shared library: [libfoo.so]' $t/log2 || false +grep -Fq 'Shared library: [libfoo.so]' $t/log2 ! 
grep -Fq 'Shared library: [libbar.so]' $t/log2 || false diff --git a/test/elf/as-needed.sh b/test/as-needed.sh similarity index 60% rename from test/elf/as-needed.sh rename to test/as-needed.sh index b0389c27..6d5448c8 100755 --- a/test/elf/as-needed.sh +++ b/test/as-needed.sh @@ -18,12 +18,12 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,--no-as-needed $t/b.so $t/c.so -readelf --dynamic $t/exe > $t/readelf -grep -Fq 'Shared library: [libfoo.so]' $t/readelf -grep -Fq 'Shared library: [libbar.so]' $t/readelf +readelf --dynamic $t/exe > $t/log +grep -Fq 'Shared library: [libfoo.so]' $t/log +grep -Fq 'Shared library: [libbar.so]' $t/log $CC -B. -o $t/exe $t/a.o -Wl,--as-needed $t/b.so $t/c.so -readelf --dynamic $t/exe > $t/readelf -grep -Fq 'Shared library: [libfoo.so]' $t/readelf -! grep -Fq 'Shared library: [libbar.so]' $t/readelf || false +readelf --dynamic $t/exe > $t/log +grep -Fq 'Shared library: [libfoo.so]' $t/log +! grep -Fq 'Shared library: [libbar.so]' $t/log || false diff --git a/test/elf/auxiliary.sh b/test/auxiliary.sh similarity index 100% rename from test/elf/auxiliary.sh rename to test/auxiliary.sh diff --git a/test/elf/bno-symbolic.sh b/test/bno-symbolic.sh similarity index 97% rename from test/elf/bno-symbolic.sh rename to test/bno-symbolic.sh index e577bc08..213a1cc8 100755 --- a/test/elf/bno-symbolic.sh +++ b/test/bno-symbolic.sh @@ -3,7 +3,7 @@ # GCC produces buggy code for this test case on s390x. # https://sourceware.org/bugzilla/show_bug.cgi?id=29655 -[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-3]\.' && skip +[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-4]\.' && skip cat < + +int foo = 3; +int bar = 3; +int baz = 3; + +int get_foo1() { return 7; } +int get_bar1() { return 7; } +int get_baz1() { return 7; } + +int get_foo2(); +int get_bar2(); +int get_baz2(); + +int main() { + printf("%d %d %d %d %d %d\n", foo, bar, baz, + get_foo2(), get_bar2(), get_baz2()); +} +EOF + +$CC -B. 
-o $t/exe $t/c.o $t/b.so +$QEMU $t/exe | grep -q '^3 3 3 3 3 7$' diff --git a/test/bsymbolic-non-weak.sh b/test/bsymbolic-non-weak.sh new file mode 100755 index 00000000..284a9970 --- /dev/null +++ b/test/bsymbolic-non-weak.sh @@ -0,0 +1,42 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +int foo = 3; +int bar = 3; +int baz = 3; + +int get_foo1() { return 7; } +int get_bar1() { return 7; } +int get_baz1() { return 7; } + +int get_foo2(); +int get_bar2(); +int get_baz2(); + +int main() { + printf("%d %d %d %d %d %d\n", foo, bar, baz, + get_foo2(), get_bar2(), get_baz2()); +} +EOF + +$CC -B. -o $t/exe $t/c.o $t/b.so +$QEMU $t/exe | grep -q '^3 3 3 3 4 7$' diff --git a/test/elf/bsymbolic.sh b/test/bsymbolic.sh similarity index 100% rename from test/elf/bsymbolic.sh rename to test/bsymbolic.sh diff --git a/test/elf/build-id.sh b/test/build-id.sh similarity index 87% rename from test/elf/build-id.sh rename to test/build-id.sh index acff861e..d2310925 100755 --- a/test/elf/build-id.sh +++ b/test/build-id.sh @@ -18,5 +18,8 @@ readelf -n $t/exe | grep -q 'GNU.*0x00000014.*NT_GNU_BUILD_ID' $CC -B. -o $t/exe $t/a.c -Wl,-build-id=sha256 readelf -n $t/exe | grep -q 'GNU.*0x00000020.*NT_GNU_BUILD_ID' +$CC -B. -o $t/exe $t/a.c -Wl,-build-id=fast +readelf -n $t/exe | grep -q 'GNU.*0x00000020.*NT_GNU_BUILD_ID' + $CC -B. -o $t/exe $t/a.c -Wl,-build-id=0xdeadbeefdeadbeef readelf -n $t/exe | grep -q 'Build ID: deadbeefdeadbeef' diff --git a/test/elf/canonical-plt.sh b/test/canonical-plt.sh similarity index 98% rename from test/elf/canonical-plt.sh rename to test/canonical-plt.sh index b736411f..53188e0e 100755 --- a/test/elf/canonical-plt.sh +++ b/test/canonical-plt.sh @@ -3,7 +3,7 @@ # GCC produces buggy code for this test case on s390x. # https://sourceware.org/bugzilla/show_bug.cgi?id=29655 -[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-3]\.' && skip +[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-4]\.' 
&& skip cat <& /dev/null + echo 'int main() {}' | $CC -B. "$@" -o /dev/null -xc - >& /dev/null } -supports_ifunc() { - echo 'void x() __attribute__((ifunc("y"))); void *y() { return 0; }' | \ - $CC -c -o /dev/null -xc - >& /dev/null +test_cxxflags() { + echo 'int main() {}' | $CXX -B. "$@" -o /dev/null -xc++ - >& /dev/null } is_musl() { - ldd --help 2>&1 | grep -q musl + ldd --version 2>&1 | grep -q musl +} + +supports_ifunc() { + ! is_musl && \ + echo 'void x() __attribute__((ifunc("y"))); void *y() { return 0; }' | \ + $CC -c -o /dev/null -xc - >& /dev/null } supports_tlsdesc() { # musl's tlsdesc on arm32 seems to be broken [ $MACHINE = arm ] && is_musl && return 1 - [ -n "$tlsdesc_opt" ] + # FreeBSD's loader doesn't seem to support TLSDESC relocs in an executable + [ "$(uname)" = FreeBSD ] && return 1 + + [ "$tlsdesc_opt" != '' ] +} + +on_qemu() { + [ "$QEMU" != '' ] || grep -qw qemu /proc/cpuinfo 2> /dev/null } skip() { @@ -112,3 +131,4 @@ testname=$(basename "$0" .sh) echo -n "Testing $testname ... 
" t=$TESTDIR/$testname mkdir -p $t +set -x diff --git a/test/elf/compress-debug-sections-zstd.sh b/test/compress-debug-sections-zstd.sh similarity index 100% rename from test/elf/compress-debug-sections-zstd.sh rename to test/compress-debug-sections-zstd.sh diff --git a/test/elf/compress-debug-sections.sh b/test/compress-debug-sections.sh similarity index 100% rename from test/elf/compress-debug-sections.sh rename to test/compress-debug-sections.sh diff --git a/test/elf/compressed-debug-info.sh b/test/compressed-debug-info.sh similarity index 100% rename from test/elf/compressed-debug-info.sh rename to test/compressed-debug-info.sh diff --git a/test/elf/copyrel-alignment.sh b/test/copyrel-alignment.sh similarity index 96% rename from test/elf/copyrel-alignment.sh rename to test/copyrel-alignment.sh index 4b265ac7..432179bd 100755 --- a/test/elf/copyrel-alignment.sh +++ b/test/copyrel-alignment.sh @@ -3,7 +3,6 @@ [ $MACHINE = ppc64 ] && skip [ $MACHINE = ppc64le ] && skip -[ $MACHINE = alpha ] && skip [[ $MACHINE = loongarch* ]] && skip cat < + +extern char msg[100]; + +int main() { + printf("%s\n", msg); +} +EOF + +cat < $t/log1 +grep -Fq .copyrel.rel.ro $t/log1 + +$CC -B. $t/a.o $t/b.so -o $t/exe2 -no-pie -Wl,-z,norelro +readelf -W --sections $t/exe2 > $t/log2 +! 
grep -Fq .copyrel.rel.ro $t/log2 || false diff --git a/test/elf/copyrel-protected.sh b/test/copyrel-protected.sh similarity index 80% rename from test/elf/copyrel-protected.sh rename to test/copyrel-protected.sh index 8c4c0a09..0cd196c4 100755 --- a/test/elf/copyrel-protected.sh +++ b/test/copyrel-protected.sh @@ -3,7 +3,6 @@ [ $MACHINE = ppc64 ] && skip [ $MACHINE = ppc64le ] && skip -[ $MACHINE = alpha ] && skip [[ $MACHINE = loongarch* ]] && skip cat <& $t/log -no-pie || false -grep -Fq 'cannot make copy relocation for protected symbol' $t/log +grep -Fq 'cannot create a copy relocation for protected symbol' $t/log diff --git a/test/elf/copyrel-relro.sh b/test/copyrel-relro.sh similarity index 100% rename from test/elf/copyrel-relro.sh rename to test/copyrel-relro.sh diff --git a/test/elf/copyrel-relro2.sh b/test/copyrel-relro2.sh similarity index 100% rename from test/elf/copyrel-relro2.sh rename to test/copyrel-relro2.sh diff --git a/test/elf/copyrel.sh b/test/copyrel.sh similarity index 100% rename from test/elf/copyrel.sh rename to test/copyrel.sh diff --git a/test/elf/ctors-in-init-array.sh b/test/ctors-in-init-array.sh similarity index 100% rename from test/elf/ctors-in-init-array.sh rename to test/ctors-in-init-array.sh diff --git a/test/elf/dead-debug-sections.sh b/test/dead-debug-sections.sh similarity index 100% rename from test/elf/dead-debug-sections.sh rename to test/dead-debug-sections.sh diff --git a/test/elf/debug-macro-section.sh b/test/debug-macro-section.sh similarity index 100% rename from test/elf/debug-macro-section.sh rename to test/debug-macro-section.sh diff --git a/test/elf/default-symver.sh b/test/default-symver.sh similarity index 100% rename from test/elf/default-symver.sh rename to test/default-symver.sh diff --git a/test/elf/defsym-lto.sh b/test/defsym-lto.sh similarity index 79% rename from test/elf/defsym-lto.sh rename to test/defsym-lto.sh index 3848384b..d60b83df 100755 --- a/test/elf/defsym-lto.sh +++ b/test/defsym-lto.sh @@ 
-1,8 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int main() {}' | $CC -flto -o /dev/null -xc - >& /dev/null \ - || skip +test_cflags -flto || skip cat < diff --git a/test/elf/defsym-missing-symbol.sh b/test/defsym-missing-symbol.sh similarity index 100% rename from test/elf/defsym-missing-symbol.sh rename to test/defsym-missing-symbol.sh diff --git a/test/elf/defsym.sh b/test/defsym.sh similarity index 100% rename from test/elf/defsym.sh rename to test/defsym.sh diff --git a/test/elf/defsym2.sh b/test/defsym2.sh similarity index 100% rename from test/elf/defsym2.sh rename to test/defsym2.sh diff --git a/test/demangle-cpp.sh b/test/demangle-cpp.sh new file mode 100755 index 00000000..d4db602d --- /dev/null +++ b/test/demangle-cpp.sh @@ -0,0 +1,19 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <<'EOF' | $CC -c -o $t/a.o -xc - +void _ZN2ns7versionEv(); +int main() { _ZN2ns7versionEv(); } +EOF + +! $CC -B. -o $t/exe1 $t/a.o 2> $t/log || false +grep -Fq 'ns::version()' $t/log + +cat <<'EOF' | $CC -c -o $t/b.o -xc - +void _ZN2ns7versionEv(); +int main() { _ZN2ns7versionEv(); } +__attribute__((section(".comment"))) char str[] = "rustc version x.y.z\n"; +EOF + +! $CC -B. -o $t/exe2 $t/b.o 2> $t/log || false +grep -Fq 'ns::versionv' $t/log diff --git a/test/elf/demangle-rust.sh b/test/demangle-rust.sh similarity index 100% rename from test/elf/demangle-rust.sh rename to test/demangle-rust.sh diff --git a/test/elf/demangle.sh b/test/demangle.sh similarity index 100% rename from test/elf/demangle.sh rename to test/demangle.sh diff --git a/test/dependency-file-response-file.sh b/test/dependency-file-response-file.sh new file mode 100755 index 00000000..967d0977 --- /dev/null +++ b/test/dependency-file-response-file.sh @@ -0,0 +1,17 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +int main() { + printf("Hello world\n"); +} +EOF + +echo "$t/a.o -Wl,-dependency-file=$t/dep" > $t/rsp + +$CC -B. 
-o $t/exe @$t/rsp + +grep -q '/exe:.*/a.o ' $t/dep +grep -q '/a.o:$' $t/dep +! grep -q '^/tmp' $t/dep || false diff --git a/test/elf/dependency-file.sh b/test/dependency-file.sh similarity index 100% rename from test/elf/dependency-file.sh rename to test/dependency-file.sh diff --git a/test/elf/disable-new-dtags.sh b/test/disable-new-dtags.sh similarity index 100% rename from test/elf/disable-new-dtags.sh rename to test/disable-new-dtags.sh diff --git a/test/elf/discard.sh b/test/discard.sh similarity index 91% rename from test/elf/discard.sh rename to test/discard.sh index e419838b..b7628c7e 100755 --- a/test/elf/discard.sh +++ b/test/discard.sh @@ -1,7 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = riscv64 -o $MACHINE = riscv32 ] && skip +[[ $MACHINE = riscv* ]] && skip +[[ $MACHINE = loongarch* ]] && skip cat < $t/log +grep -wq foo $t/log +! grep -wq bar $t/log || false diff --git a/test/elf/dynamic-list.sh b/test/dynamic-list.sh similarity index 100% rename from test/elf/dynamic-list.sh rename to test/dynamic-list.sh diff --git a/test/elf/dynamic-list2.sh b/test/dynamic-list2.sh similarity index 100% rename from test/elf/dynamic-list2.sh rename to test/dynamic-list2.sh diff --git a/test/elf/dynamic-list3.sh b/test/dynamic-list3.sh similarity index 100% rename from test/elf/dynamic-list3.sh rename to test/dynamic-list3.sh diff --git a/test/elf/dynamic-list4.sh b/test/dynamic-list4.sh similarity index 100% rename from test/elf/dynamic-list4.sh rename to test/dynamic-list4.sh diff --git a/test/elf/dynamic.sh b/test/dynamic.sh similarity index 84% rename from test/elf/dynamic.sh rename to test/dynamic.sh index ce207c6a..2b9576c2 100755 --- a/test/elf/dynamic.sh +++ b/test/dynamic.sh @@ -9,7 +9,7 @@ readelf --dynamic $t/exe > $t/log grep -Eq 'Shared library:.*\blibc\b' $t/log readelf -W --dyn-syms --use-dynamic $t/exe > $t/log2 -grep -Eq 'FUNC\s+GLOBAL\s+DEFAULT.*UND\s+__libc_start_main' $t/log2 +grep -Eq 'FUNC\s+GLOBAL\s+DEFAULT.*UND\s+__libc_start' 
$t/log2 cat < diff --git a/test/elf/mold-wrapper2.sh b/test/elf/mold-wrapper2.sh deleted file mode 100755 index 303cad1e..00000000 --- a/test/elf/mold-wrapper2.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -ldd mold-wrapper.so | grep -q libasan && skip - -nm mold | grep -q '__[at]san_init' && skip - -rm -rf $t -mkdir -p $t/bin $t/lib/mold -cp mold $t/bin -cp mold-wrapper.so $t/bin - -$t/bin/mold -run bash -c 'echo $LD_PRELOAD' | grep -q '/bin/mold-wrapper.so' diff --git a/test/elf/now.sh b/test/elf/now.sh deleted file mode 100755 index 37b83d26..00000000 --- a/test/elf/now.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -cat < - -void foo() { - printf("Hello world\n"); -} -EOF - -$CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,now -readelf --dynamic $t/b.so | grep -q 'Flags: NOW' - -$CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,now,-z,lazy -readelf --dynamic $t/b.so > $t/log -! grep -q 'Flags: NOW' $t/log || false diff --git a/test/elf/pack-dyn-relocs-relr.sh b/test/elf/pack-dyn-relocs-relr.sh deleted file mode 100755 index c2cad3f8..00000000 --- a/test/elf/pack-dyn-relocs-relr.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -[ $MACHINE = m68k ] && skip -[ $MACHINE = ppc ] && skip - -command -v llvm-readelf >& /dev/null || skip - -cat < -int main() { - printf("Hello world\n"); -} -EOF - -$CC -B. -o $t/exe1 $t/a.o -pie -llvm-readelf -r $t/exe1 | grep RELATIVE | wc -l > $t/log1 - -$CC -B. -o $t/exe2 $t/a.o -pie -Wl,-pack-dyn-relocs=relr -llvm-readelf -r $t/exe2 | grep RELATIVE | wc -l > $t/log2 - -diff $t/log1 $t/log2 - -llvm-readelf --dynamic $t/exe2 > $t/log3 -grep -wq RELR $t/log3 -grep -wq RELRSZ $t/log3 -grep -wq RELRENT $t/log3 diff --git a/test/elf/relocatable-no-ehframe.sh b/test/elf/relocatable-no-ehframe.sh deleted file mode 100755 index d7c2e1a6..00000000 --- a/test/elf/relocatable-no-ehframe.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -. 
$(dirname $0)/common.inc - -[ $MACHINE = alpha ] && skip - -# OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip - -cat < $t/log1 -! grep -Fq .eh_frame $t/log1 || false - -./mold --relocatable -o $t/b.o $t/a.o -readelf -WS $t/b.o > $t/log2 -! grep -Fq .eh_frame $t/log2 || false diff --git a/test/elf/run.sh b/test/elf/run.sh deleted file mode 100755 index 2863134e..00000000 --- a/test/elf/run.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -[ "$CC" = cc ] || skip - -# ASAN doesn't work with LD_PRELOAD -nm mold | grep -q '__[at]san_init' && skip - -cat <<'EOF' | $CC -xc -c -o $t/a.o - -#include - -int main() { - printf("Hello\n"); - return 0; -} -EOF - -LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \ - $GCC -o $t/exe $t/a.o -B/usr/bin -readelf -p .comment $t/exe > $t/log -grep -q '[ms]old' $t/log - -./mold -run env | grep -q '^MOLD_PATH=.*/mold$' - -./mold -run /usr/bin/ld --version | grep -q '[ms]old' -./mold -run /usr/bin/ld.lld --version | grep -q '[ms]old' -./mold -run /usr/bin/ld.gold --version | grep -q '[ms]old' - -rm -f $t/ld $t/ld.lld $t/ld.gold $t/foo.ld -touch $t/ld $t/ld.lld $t/ld.gold -echo "#!/bin/sh" >$t/foo.ld -chmod 755 $t/ld $t/ld.lld $t/ld.gold $t/foo.ld - -./mold -run $t/ld --version | grep -q '[ms]old' -./mold -run $t/ld.lld --version | grep -q '[ms]old' -./mold -run $t/ld.gold --version | grep -q '[ms]old' -./mold -run $t/foo.ld --version | grep -q '[ms]old' && false - -cat <<'EOF' > $t/sh -#!/bin/sh -$1 --version -EOF - -chmod 755 $t/sh - -./mold -run $t/sh ld --version | grep -q '[ms]old' -./mold -run $t/sh foo.ld --version >& /dev/null | grep -q '[ms]old' && false - -./mold -run $t/sh $t/ld --version | grep -q '[ms]old' -./mold -run $t/sh $t/ld.lld --version | grep -q '[ms]old' -./mold -run $t/sh $t/ld.gold --version | grep -q '[ms]old' -./mold -run $t/sh $t/foo.ld --version | grep -q '[ms]old' && false diff --git a/test/elf/shared-abs-sym.sh b/test/elf/shared-abs-sym.sh deleted file mode 
100755 index 778d8bad..00000000 --- a/test/elf/shared-abs-sym.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -cat < $t/c.c -#include -extern char foo; -int main() { printf("foo=%p\n", &foo); } -EOF - -$CC -fPIC -c -o $t/d.o $t/c.c -$CC -B. -o $t/exe1 -pie $t/d.o $t/b.so -$QEMU $t/exe1 | grep -q 'foo=0x3' - -nm -D $t/exe1 > $t/log1 -! grep -q foo $t/log1 || false - -$CC -fPIC -c -o $t/e.o $t/c.c -$CC -B. -o $t/exe2 -no-pie $t/e.o $t/b.so -$QEMU $t/exe2 | grep -q 'foo=0x3' - -nm -D $t/exe2 > $t/log2 -! grep -q foo $t/log2 || false diff --git a/test/elf/z-pack-relative-relocs.sh b/test/elf/z-pack-relative-relocs.sh deleted file mode 100755 index e09d441e..00000000 --- a/test/elf/z-pack-relative-relocs.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -cat < -int main() { - printf("Hello world\n"); -} -EOF - -$CC -B. -o $t/exe $t/a.o -pie -Wl,-z,pack-relative-relocs - -readelf -W -V $t/exe > $t/log -grep -Fq GLIBC_2. $t/log || skip - -grep -q GLIBC_ABI_DT_RELR $t/log diff --git a/test/elf/z-start-stop-visibility.sh b/test/elf/z-start-stop-visibility.sh deleted file mode 100755 index 7efc94b5..00000000 --- a/test/elf/z-start-stop-visibility.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -./mold -z start-stop-visibility=hidden --version > /dev/null -! 
./mold -z start-stop-visibility=protected --version 2> $t/log -grep -q 'unsupported visibility: protected' $t/log diff --git a/test/elf/emit-relocs-cpp.sh b/test/emit-relocs-cpp.sh similarity index 100% rename from test/elf/emit-relocs-cpp.sh rename to test/emit-relocs-cpp.sh diff --git a/test/elf/emit-relocs-dead-sections.sh b/test/emit-relocs-dead-sections.sh similarity index 100% rename from test/elf/emit-relocs-dead-sections.sh rename to test/emit-relocs-dead-sections.sh diff --git a/test/elf/emit-relocs.sh b/test/emit-relocs.sh similarity index 100% rename from test/elf/emit-relocs.sh rename to test/emit-relocs.sh diff --git a/test/empty-arg.sh b/test/empty-arg.sh new file mode 100755 index 00000000..60182b07 --- /dev/null +++ b/test/empty-arg.sh @@ -0,0 +1,5 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +! ./mold -m elf_x86_64 '' >& $t/log +grep -q 'cannot open :' $t/log diff --git a/test/elf/empty-file.sh b/test/empty-file.sh similarity index 100% rename from test/elf/empty-file.sh rename to test/empty-file.sh diff --git a/test/elf/empty-input.sh b/test/empty-input.sh similarity index 100% rename from test/elf/empty-input.sh rename to test/empty-input.sh diff --git a/test/elf/empty-version.sh b/test/empty-version.sh similarity index 100% rename from test/elf/empty-version.sh rename to test/empty-version.sh diff --git a/test/elf/entry.sh b/test/entry.sh similarity index 100% rename from test/elf/entry.sh rename to test/entry.sh diff --git a/test/exception-multiple-ehframe.sh b/test/exception-multiple-ehframe.sh new file mode 100755 index 00000000..c411eb92 --- /dev/null +++ b/test/exception-multiple-ehframe.sh @@ -0,0 +1,48 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +nm mold | grep -q '__tsan_init' && skip + +command -v perl > /dev/null || skip + +[ $MACHINE = sh4 ] && skip + +cat < + +int foo(); +int bar(); + +int main() { + printf("%d %d\n", foo(), bar()); +} +EOF + +$CXX -B. 
-o $t/exe1 $t/d.o $t/c.o +$QEMU $t/exe1 | grep -q '^1 3$' diff --git a/test/elf/exception.sh b/test/exception.sh similarity index 93% rename from test/elf/exception.sh rename to test/exception.sh index a713a0db..e054ec52 100755 --- a/test/elf/exception.sh +++ b/test/exception.sh @@ -1,11 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = m68k ] && skip -[ $MACHINE = sh4 ] && skip - static= -test_cflags -static && static=-static +test_cxxflags -static && static=-static # I don't know why, but we need -pthread on m68k static="$static -pthread" diff --git a/test/elf/exclude-libs.sh b/test/exclude-libs.sh similarity index 84% rename from test/elf/exclude-libs.sh rename to test/exclude-libs.sh index eb390310..39243991 100755 --- a/test/elf/exclude-libs.sh +++ b/test/exclude-libs.sh @@ -48,6 +48,12 @@ readelf --dyn-syms $t/f.so > $t/log ! grep -Fq bar $t/log || false grep -Fq baz $t/log +$CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a:d.a +readelf --dyn-syms $t/f.so > $t/log +! grep -Fq foo $t/log || false +! grep -Fq bar $t/log || false +grep -Fq baz $t/log + $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=ALL readelf --dyn-syms $t/f.so > $t/log ! 
grep -Fq foo $t/log || false diff --git a/test/elf/exclude-libs2.sh b/test/exclude-libs2.sh similarity index 100% rename from test/elf/exclude-libs2.sh rename to test/exclude-libs2.sh diff --git a/test/elf/exclude-libs3.sh b/test/exclude-libs3.sh similarity index 100% rename from test/elf/exclude-libs3.sh rename to test/exclude-libs3.sh diff --git a/test/elf/execstack.sh b/test/execstack.sh similarity index 100% rename from test/elf/execstack.sh rename to test/execstack.sh diff --git a/test/elf/execute-only.sh b/test/execute-only.sh similarity index 94% rename from test/elf/execute-only.sh rename to test/execute-only.sh index 7af00395..ecfe2700 100755 --- a/test/elf/execute-only.sh +++ b/test/execute-only.sh @@ -6,6 +6,7 @@ # GCC emits data to .text for PPC64, so PPC64 is not compatible with -execute-only [ $MACHINE = ppc64 ] && skip +[ $MACHINE = ppc64le ] && skip cat < diff --git a/test/elf/export-dynamic.sh b/test/export-dynamic.sh similarity index 100% rename from test/elf/export-dynamic.sh rename to test/export-dynamic.sh diff --git a/test/elf/export-from-exe.sh b/test/export-from-exe.sh similarity index 100% rename from test/elf/export-from-exe.sh rename to test/export-from-exe.sh diff --git a/test/elf/fatal-warnings.sh b/test/fatal-warnings.sh similarity index 100% rename from test/elf/fatal-warnings.sh rename to test/fatal-warnings.sh diff --git a/test/elf/filler.sh b/test/filler.sh similarity index 100% rename from test/elf/filler.sh rename to test/filler.sh diff --git a/test/elf/filter.sh b/test/filter.sh similarity index 100% rename from test/elf/filter.sh rename to test/filter.sh diff --git a/test/elf/func-addr.sh b/test/func-addr.sh similarity index 100% rename from test/elf/func-addr.sh rename to test/func-addr.sh diff --git a/test/elf/gc-sections.sh b/test/gc-sections.sh similarity index 100% rename from test/elf/gc-sections.sh rename to test/gc-sections.sh diff --git a/test/elf/gdb-index-compress-output.sh b/test/gdb-index-compress-output.sh 
similarity index 97% rename from test/elf/gdb-index-compress-output.sh rename to test/gdb-index-compress-output.sh index f3c90786..0b180ac9 100755 --- a/test/elf/gdb-index-compress-output.sh +++ b/test/gdb-index-compress-output.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = $HOST ] || skip +on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip diff --git a/test/elf/gdb-index-dwarf2.sh b/test/gdb-index-dwarf2.sh similarity index 97% rename from test/elf/gdb-index-dwarf2.sh rename to test/gdb-index-dwarf2.sh index ffc25fea..79497935 100755 --- a/test/elf/gdb-index-dwarf2.sh +++ b/test/gdb-index-dwarf2.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = $HOST ] || skip +on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip diff --git a/test/elf/gdb-index-dwarf3.sh b/test/gdb-index-dwarf3.sh similarity index 97% rename from test/elf/gdb-index-dwarf3.sh rename to test/gdb-index-dwarf3.sh index 03af7751..a093eade 100755 --- a/test/elf/gdb-index-dwarf3.sh +++ b/test/gdb-index-dwarf3.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = $HOST ] || skip +on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip diff --git a/test/elf/gdb-index-dwarf4.sh b/test/gdb-index-dwarf4.sh similarity index 97% rename from test/elf/gdb-index-dwarf4.sh rename to test/gdb-index-dwarf4.sh index 6af263cf..e028fa99 100755 --- a/test/elf/gdb-index-dwarf4.sh +++ b/test/gdb-index-dwarf4.sh @@ -1,7 +1,7 @@ #!/bin/bash . 
$(dirname $0)/common.inc -[ $MACHINE = $HOST ] || skip +on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip diff --git a/test/elf/gdb-index-dwarf5.sh b/test/gdb-index-dwarf5.sh similarity index 85% rename from test/elf/gdb-index-dwarf5.sh rename to test/gdb-index-dwarf5.sh index 1d4b5ad9..1f3ebc84 100755 --- a/test/elf/gdb-index-dwarf5.sh +++ b/test/gdb-index-dwarf5.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = $HOST ] || skip +on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip @@ -65,6 +65,8 @@ $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -ffunction-sections $CC -B. -shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index readelf -WS $t/e.so 2> /dev/null | grep -Fq .gdb_index +readelf --debug=gdb_index $t/e.so 2> /dev/null | grep -q 'fn1: .* \[global, function\]' +readelf --debug=gdb_index $t/e.so 2> /dev/null | grep -q 'char: .* \[static, type\]' cat < /dev/null | grep -Fq .gdb_index +readelf --debug=gdb_index $t/exe 2> /dev/null | grep -q 'main: .* \[global, function\]' $QEMU $t/exe | grep -q 'Hello world' diff --git a/test/elf/gdb-index-dwarf64.sh b/test/gdb-index-dwarf64.sh similarity index 98% rename from test/elf/gdb-index-dwarf64.sh rename to test/gdb-index-dwarf64.sh index 3d7b5913..819956ef 100755 --- a/test/elf/gdb-index-dwarf64.sh +++ b/test/gdb-index-dwarf64.sh @@ -1,7 +1,7 @@ #!/bin/bash . 
$(dirname $0)/common.inc -[ $MACHINE = $HOST ] || skip +on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip diff --git a/test/elf/gdb-index-empty.sh b/test/gdb-index-empty.sh similarity index 100% rename from test/elf/gdb-index-empty.sh rename to test/gdb-index-empty.sh diff --git a/test/elf/gdb-index-split-dwarf.sh b/test/gdb-index-split-dwarf.sh similarity index 97% rename from test/elf/gdb-index-split-dwarf.sh rename to test/gdb-index-split-dwarf.sh index 40eb5d62..cbb1d30f 100755 --- a/test/elf/gdb-index-split-dwarf.sh +++ b/test/gdb-index-split-dwarf.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = $HOST ] || skip +on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip diff --git a/test/elf/glibc-2.22-bug.sh b/test/glibc-2.22-bug.sh similarity index 94% rename from test/elf/glibc-2.22-bug.sh rename to test/glibc-2.22-bug.sh index 1539d209..27820acc 100755 --- a/test/elf/glibc-2.22-bug.sh +++ b/test/glibc-2.22-bug.sh @@ -1,7 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -[ $MACHINE = alpha ] && skip # glibc 2.22 or prior have a bug that ld-linux.so.2 crashes on dlopen() # if .rela.dyn and .rela.plt are not contiguous in a given DSO. diff --git a/test/elf/global-offset-table.sh b/test/global-offset-table.sh similarity index 100% rename from test/elf/global-offset-table.sh rename to test/global-offset-table.sh diff --git a/test/elf/gnu-hash.sh b/test/gnu-hash.sh similarity index 100% rename from test/elf/gnu-hash.sh rename to test/gnu-hash.sh diff --git a/test/gnu-property.sh b/test/gnu-property.sh new file mode 100755 index 00000000..aff85c01 --- /dev/null +++ b/test/gnu-property.sh @@ -0,0 +1,10 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat < /dev/null + __attribute__((retain)) int foo() {} +int bar() {} +int main() {} +EOF + +# Older versions of GCC does not support __attribute__((retain)) +readelf -WS $t/a.o | grep -q '\.text\.foo.*AXR' || skip + +$CC -B. -o $t/exe $t/a.o -Wl,-gc-sections +nm $t/exe > $t/log +grep -q foo $t/log +! grep -q bar $t/log || false diff --git a/test/elf/gnu-unique.sh b/test/gnu-unique.sh similarity index 100% rename from test/elf/gnu-unique.sh rename to test/gnu-unique.sh diff --git a/test/elf/gnu-warning.sh b/test/gnu-warning.sh similarity index 100% rename from test/elf/gnu-warning.sh rename to test/gnu-warning.sh diff --git a/test/elf/hash-style.sh b/test/hash-style.sh similarity index 100% rename from test/elf/hash-style.sh rename to test/hash-style.sh diff --git a/test/elf/hello-dynamic.sh b/test/hello-dynamic.sh similarity index 100% rename from test/elf/hello-dynamic.sh rename to test/hello-dynamic.sh diff --git a/test/elf/hello-static.sh b/test/hello-static.sh similarity index 100% rename from test/elf/hello-static.sh rename to test/hello-static.sh diff --git a/test/elf/help.sh b/test/help.sh similarity index 100% rename from test/elf/help.sh rename to test/help.sh diff --git a/test/hidden-archive.sh b/test/hidden-archive.sh new file mode 100755 index 00000000..9364e198 --- /dev/null +++ b/test/hidden-archive.sh @@ -0,0 +1,21 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat < #include diff --git a/test/elf/large-alignment-dso.sh b/test/large-alignment-dso.sh similarity index 100% rename from test/elf/large-alignment-dso.sh rename to test/large-alignment-dso.sh diff --git a/test/elf/large-alignment.sh b/test/large-alignment.sh similarity index 100% rename from test/elf/large-alignment.sh rename to test/large-alignment.sh diff --git a/test/elf/large-max-page-size-strip.sh b/test/large-max-page-size-strip.sh similarity index 100% rename from test/elf/large-max-page-size-strip.sh rename to test/large-max-page-size-strip.sh diff --git a/test/elf/large-max-page-size.sh b/test/large-max-page-size.sh similarity index 100% rename from test/elf/large-max-page-size.sh rename to test/large-max-page-size.sh diff --git a/test/elf/large-text.sh b/test/large-text.sh similarity index 100% rename from test/elf/large-text.sh rename to test/large-text.sh diff --git a/test/library.sh b/test/library.sh new file mode 100755 index 00000000..91d40bff --- /dev/null +++ b/test/library.sh @@ -0,0 +1,22 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +void hello() { + printf("Hello world\n"); +} +EOF + +$CC -B. -shared -o $t/libfoobar.so $t/a.o + +cat < $t/b.script + +! $CC -B. 
-o $t/exe $t/a.o $t/b.script 2> $t/log +grep -q 'unclosed comment' $t/log diff --git a/test/elf/linker-script-relocatable.sh b/test/linker-script-relocatable.sh similarity index 100% rename from test/elf/linker-script-relocatable.sh rename to test/linker-script-relocatable.sh diff --git a/test/elf/linker-script.sh b/test/linker-script.sh similarity index 100% rename from test/elf/linker-script.sh rename to test/linker-script.sh diff --git a/test/elf/linker-script2.sh b/test/linker-script2.sh similarity index 100% rename from test/elf/linker-script2.sh rename to test/linker-script2.sh diff --git a/test/elf/linker-script3.sh b/test/linker-script3.sh similarity index 100% rename from test/elf/linker-script3.sh rename to test/linker-script3.sh diff --git a/test/elf/linker-script4.sh b/test/linker-script4.sh similarity index 100% rename from test/elf/linker-script4.sh rename to test/linker-script4.sh diff --git a/test/linker-script5.sh b/test/linker-script5.sh new file mode 100755 index 00000000..f31b29b5 --- /dev/null +++ b/test/linker-script5.sh @@ -0,0 +1,14 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +mkdir -p $t/foo + +cat < $t/foo/b.script +INPUT(a.o) +EOF + +$CC -B. -o $t/exe $t/foo/b.script diff --git a/test/linker-script6.sh b/test/linker-script6.sh new file mode 100755 index 00000000..e767f33e --- /dev/null +++ b/test/linker-script6.sh @@ -0,0 +1,15 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +mkdir -p $t/foo + +cat < $t/foo/b.script + +INPUT(a.o) +EOF + +$CC -B. -o $t/exe $t/foo/b.script diff --git a/test/elf/lto-archive.sh b/test/lto-archive.sh similarity index 87% rename from test/elf/lto-archive.sh rename to test/lto-archive.sh index 88ce90f0..3938a9b2 100755 --- a/test/elf/lto-archive.sh +++ b/test/lto-archive.sh @@ -2,9 +2,7 @@ . 
$(dirname $0)/common.inc [ "$CC" = cc ] || skip - -echo 'int main() {}' | $CC -flto -o /dev/null -xc - >& /dev/null \ - || skip +test_cflags -flto || skip cat < diff --git a/test/lto-archive2.sh b/test/lto-archive2.sh new file mode 100755 index 00000000..8a63b9d7 --- /dev/null +++ b/test/lto-archive2.sh @@ -0,0 +1,15 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +test_cflags -flto=auto || skip + +echo | $CC -o $t/a.o -c -flto=auto -xc - + +rm -f $t/b.a +ar rc $t/b.a $t/a.o + +cat <& /dev/null \ - || skip +test_cflags -flto || skip cat <& /dev/null \ +echo 'int main() {}' | $GCC -B. -flto -o /dev/null -xc - >& /dev/null \ || skip cat <& /dev/null \ +echo 'int main() {}' | clang -B. -flto -o /dev/null -xc - >& /dev/null \ || skip cat < $t/a.s -seq 1 100000 | sed 's/.*/.section .data.\0,"aw"\n.globl x\0\nx\0: .word 0\n/g' >> $t/a.s +seq 1 100000 | sed 's/.*/.section .data.&,"aw"\n.globl x&\nx&: .word 0\n/g' >> $t/a.s $CC -c -xassembler -o $t/a.o $t/a.s ./mold --relocatable -o $t/b.o $t/a.o diff --git a/test/elf/mergeable-strings.sh b/test/mergeable-strings.sh similarity index 100% rename from test/elf/mergeable-strings.sh rename to test/mergeable-strings.sh diff --git a/test/elf/missing-but-ok.sh b/test/missing-but-ok.sh similarity index 100% rename from test/elf/missing-but-ok.sh rename to test/missing-but-ok.sh diff --git a/test/elf/missing-error.sh b/test/missing-error.sh similarity index 100% rename from test/elf/missing-error.sh rename to test/missing-error.sh diff --git a/test/elf/mold-wrapper.sh b/test/mold-wrapper.sh similarity index 98% rename from test/elf/mold-wrapper.sh rename to test/mold-wrapper.sh index 2bd0bb99..4748c8d7 100755 --- a/test/elf/mold-wrapper.sh +++ b/test/mold-wrapper.sh @@ -8,7 +8,7 @@ ldd mold-wrapper.so | grep -q libasan && skip nm mold | grep -q '__[at]san_init' && skip cat <<'EOF' > $t/a.sh -#!/bin/bash +#!/usr/bin/env bash echo "$0" "$@" $FOO EOF diff --git a/test/mold-wrapper2.sh b/test/mold-wrapper2.sh new file mode 100755 index 
00000000..efeaf05b --- /dev/null +++ b/test/mold-wrapper2.sh @@ -0,0 +1,7 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +ldd mold-wrapper.so | grep -q libasan && skip +nm mold | grep -q '__[at]san_init' && skip + +./mold -run bash -c 'echo $LD_PRELOAD' | grep -Fq mold-wrapper.so diff --git a/test/nmagic.sh b/test/nmagic.sh new file mode 100755 index 00000000..2590946e --- /dev/null +++ b/test/nmagic.sh @@ -0,0 +1,14 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <& $t/log || false +grep -Fq 'undefined symbol: foo' $t/log diff --git a/test/elf/no-eh-frame-header.sh b/test/no-eh-frame-header.sh similarity index 100% rename from test/elf/no-eh-frame-header.sh rename to test/no-eh-frame-header.sh diff --git a/test/elf/bug178.sh b/test/no-object-file.sh similarity index 100% rename from test/elf/bug178.sh rename to test/no-object-file.sh diff --git a/test/elf/no-quick-exit.sh b/test/no-quick-exit.sh similarity index 100% rename from test/elf/no-quick-exit.sh rename to test/no-quick-exit.sh diff --git a/test/elf/no-undefined-version.sh b/test/no-undefined-version.sh similarity index 100% rename from test/elf/no-undefined-version.sh rename to test/no-undefined-version.sh diff --git a/test/elf/nocopyreloc.sh b/test/nocopyreloc.sh similarity index 95% rename from test/elf/nocopyreloc.sh rename to test/nocopyreloc.sh index 06165fbc..bcfa044f 100755 --- a/test/elf/nocopyreloc.sh +++ b/test/nocopyreloc.sh @@ -7,7 +7,6 @@ [ $MACHINE = ppc64 ] && skip [ $MACHINE = ppc64le ] && skip [ $MACHINE = sh4 ] && skip -[ $MACHINE = alpha ] && skip [[ $MACHINE = loongarch* ]] && skip cat < +int main() { + printf("Hello world\n"); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o -Wl,-package-metadata='{"foo":"bar"}' +readelf -x .note.package $t/exe1 | grep -Fq '{"foo":"bar"}' + +$CC -B. -o $t/exe2 $t/a.o -Wl,--encoded-package-metadata=%7B%22foo%22%3A%22bar%22%7D +readelf -x .note.package $t/exe2 | grep -Fq '{"foo":"bar"}' + +! $CC -B. 
-o $t/exe3 $t/a.o -Wl,--encoded-package-metadata=foo%x >& $t/log +grep -q 'invalid string: foo%x' $t/log diff --git a/test/elf/physical-image-base.sh b/test/physical-image-base.sh similarity index 100% rename from test/elf/physical-image-base.sh rename to test/physical-image-base.sh diff --git a/test/elf/pie.sh b/test/pie.sh similarity index 100% rename from test/elf/pie.sh rename to test/pie.sh diff --git a/test/elf/plt-dso.sh b/test/plt-dso.sh similarity index 100% rename from test/elf/plt-dso.sh rename to test/plt-dso.sh diff --git a/test/elf/pltgot.sh b/test/pltgot.sh similarity index 100% rename from test/elf/pltgot.sh rename to test/pltgot.sh diff --git a/test/elf/preinit-array.sh b/test/preinit-array.sh similarity index 100% rename from test/elf/preinit-array.sh rename to test/preinit-array.sh diff --git a/test/elf/print-dependencies.sh b/test/print-dependencies.sh similarity index 100% rename from test/elf/print-dependencies.sh rename to test/print-dependencies.sh diff --git a/test/elf/protected-dynsym.sh b/test/protected-dynsym.sh similarity index 100% rename from test/elf/protected-dynsym.sh rename to test/protected-dynsym.sh diff --git a/test/elf/protected.sh b/test/protected.sh similarity index 100% rename from test/elf/protected.sh rename to test/protected.sh diff --git a/test/elf/push-pop-state.sh b/test/push-pop-state.sh similarity index 100% rename from test/elf/push-pop-state.sh rename to test/push-pop-state.sh diff --git a/test/elf/range-extension-thunk.sh b/test/range-extension-thunk.sh similarity index 83% rename from test/elf/range-extension-thunk.sh rename to test/range-extension-thunk.sh index 57b3fc3d..065287dd 100755 --- a/test/elf/range-extension-thunk.sh +++ b/test/range-extension-thunk.sh @@ -4,13 +4,19 @@ # Skip if 32 bits as we use very large addresses in this test. [ $MACHINE = i686 ] && skip [ $MACHINE = riscv32 ] && skip +[ $MACHINE = m68k ] && skip # It looks like SPARC's runtime can't handle PLT if it's too far from GOT. 
[ $MACHINE = sparc64 ] && skip +# Current LoongArch compilers emit BL for function calls, but I believe +# they'll emit PCADDU18I + JIRL (which can address PC ± 128 GiB) in the +# future. +[[ $MACHINE = loongarch* ]] && skip + # qemu aborts with the "Unknown exception 0x5" error, although this # test passes on a real POWER10 machine. -[ -n "$QEMU" -a "$CPU" = power10 ] && skip +on_qemu && [ "$CPU" = power10 ] && skip cat < $t/a.c #include diff --git a/test/elf/range-extension-thunk2.sh b/test/range-extension-thunk2.sh similarity index 100% rename from test/elf/range-extension-thunk2.sh rename to test/range-extension-thunk2.sh diff --git a/test/range-extension-thunk3.sh b/test/range-extension-thunk3.sh new file mode 100755 index 00000000..cdc8e2b9 --- /dev/null +++ b/test/range-extension-thunk3.sh @@ -0,0 +1,16 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +[ $MACHINE = sh4 ] && skip + +seq 1 10000 | sed 's/.*/void func&() {}/' > $t/a.c +$CC -B. -o $t/b.so -shared $t/a.c + +seq 1 10000 | sed 's/.*/void func&();/' > $t/c.c +echo 'int main() {' >> $t/c.c +seq 1 10000 | sed 's/.*/func&();/' >> $t/c.c +echo '}' >> $t/c.c + +$CC -c -o $t/d.o $t/c.c +$CC -B. 
-o $t/exe $t/d.o $t/b.so +$QEMU $t/exe diff --git a/test/elf/relax-got-load.sh b/test/relax-got-load.sh similarity index 100% rename from test/elf/relax-got-load.sh rename to test/relax-got-load.sh diff --git a/test/elf/reloc-rodata.sh b/test/reloc-rodata.sh similarity index 100% rename from test/elf/reloc-rodata.sh rename to test/reloc-rodata.sh diff --git a/test/elf/relocatable-archive.sh b/test/relocatable-archive.sh similarity index 100% rename from test/elf/relocatable-archive.sh rename to test/relocatable-archive.sh diff --git a/test/elf/relocatable-c++.sh b/test/relocatable-c++.sh similarity index 88% rename from test/elf/relocatable-c++.sh rename to test/relocatable-c++.sh index e20cdfe5..6ce5e7be 100755 --- a/test/elf/relocatable-c++.sh +++ b/test/relocatable-c++.sh @@ -4,10 +4,6 @@ # OneTBB isn't tsan-clean nm mold | grep -q '__tsan_init' && skip -# Ubuntu 22.04 GCC is broken -[ $MACHINE = m68k ] && skip -[ $MACHINE = sh4 ] && skip - cat <& /dev/null || skip + +cat < +void hello() { printf("Hello world\n"); } +EOF + +cat < /dev/null 2> $t/log +! 
grep -q Warning $t/log || false diff --git a/test/elf/relocatable-exception.sh b/test/relocatable-exception.sh similarity index 100% rename from test/elf/relocatable-exception.sh rename to test/relocatable-exception.sh diff --git a/test/elf/relocatable-many-sections.sh b/test/relocatable-many-sections.sh similarity index 100% rename from test/elf/relocatable-many-sections.sh rename to test/relocatable-many-sections.sh diff --git a/test/elf/relocatable-merge-sections.sh b/test/relocatable-merge-sections.sh similarity index 100% rename from test/elf/relocatable-merge-sections.sh rename to test/relocatable-merge-sections.sh diff --git a/test/elf/relocatable-mergeable-sections.sh b/test/relocatable-mergeable-sections.sh similarity index 100% rename from test/elf/relocatable-mergeable-sections.sh rename to test/relocatable-mergeable-sections.sh diff --git a/test/elf/relocatable.sh b/test/relocatable.sh similarity index 100% rename from test/elf/relocatable.sh rename to test/relocatable.sh diff --git a/test/elf/relro.sh b/test/relro.sh similarity index 100% rename from test/elf/relro.sh rename to test/relro.sh diff --git a/test/elf/repro.sh b/test/repro.sh similarity index 71% rename from test/elf/repro.sh rename to test/repro.sh index 0250f2d3..f8b84a71 100755 --- a/test/elf/repro.sh +++ b/test/repro.sh @@ -18,12 +18,14 @@ $CC -B. -o $t/exe $t/a.o $CC -B. -o $t/exe $t/a.o -Wl,-repro tar -C $t -xf $t/exe.repro.tar +tar -C $t -tvf $t/exe.repro.tar | grep -q ' exe.repro/.*/a.o' grep -q /a.o $t/exe.repro/response.txt -grep -q '[ms]old' $t/exe.repro/version.txt +grep -q mold $t/exe.repro/version.txt rm -rf $t/exe.repro $t/exe.repro.tar MOLD_REPRO=1 $CC -B. 
-o $t/exe $t/a.o +tar -C $t -tvf $t/exe.repro.tar | grep -q ' exe.repro/.*/a.o' tar -C $t -xf $t/exe.repro.tar grep -q /a.o $t/exe.repro/response.txt -grep -q '[ms]old' $t/exe.repro/version.txt +grep -q mold $t/exe.repro/version.txt diff --git a/test/elf/require-defined.sh b/test/require-defined.sh similarity index 100% rename from test/elf/require-defined.sh rename to test/require-defined.sh diff --git a/test/elf/response-file.sh b/test/response-file.sh similarity index 100% rename from test/elf/response-file.sh rename to test/response-file.sh diff --git a/test/elf/response-file2.sh b/test/response-file2.sh similarity index 100% rename from test/elf/response-file2.sh rename to test/response-file2.sh diff --git a/test/elf/retain-symbols-file.sh b/test/retain-symbols-file.sh similarity index 69% rename from test/elf/retain-symbols-file.sh rename to test/retain-symbols-file.sh index 91c2ddfa..dba11d3c 100755 --- a/test/elf/retain-symbols-file.sh +++ b/test/retain-symbols-file.sh @@ -16,8 +16,8 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,--retain-symbols-file=$t/symbols readelf -W --symbols $t/exe > $t/log -! grep -qw foo $t/log || false -! grep -qw bar $t/log || false -! grep -qw main $t/log || false +! grep -q ' foo$' $t/log || false +! grep -q ' bar$' $t/log || false +! 
grep -q ' main$' $t/log || false -grep -qw baz $t/log +grep -q ' baz$' $t/log diff --git a/test/elf/reverse-sections.sh b/test/reverse-sections.sh similarity index 100% rename from test/elf/reverse-sections.sh rename to test/reverse-sections.sh diff --git a/test/elf/rodata-name.sh b/test/rodata-name.sh similarity index 100% rename from test/elf/rodata-name.sh rename to test/rodata-name.sh diff --git a/test/elf/rosegment.sh b/test/rosegment.sh similarity index 100% rename from test/elf/rosegment.sh rename to test/rosegment.sh diff --git a/test/elf/rpath.sh b/test/rpath.sh similarity index 100% rename from test/elf/rpath.sh rename to test/rpath.sh diff --git a/test/elf/run-clang.sh b/test/run-clang.sh similarity index 94% rename from test/elf/run-clang.sh rename to test/run-clang.sh index cb0df913..b6ce86d0 100755 --- a/test/elf/run-clang.sh +++ b/test/run-clang.sh @@ -20,4 +20,4 @@ EOF LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \ clang -no-pie -o $t/exe $t/a.o -fuse-ld=/usr/bin/ld readelf -p .comment $t/exe > $t/log -grep -q '[ms]old' $t/log +grep -q mold $t/log diff --git a/test/run.sh b/test/run.sh new file mode 100755 index 00000000..e6257636 --- /dev/null +++ b/test/run.sh @@ -0,0 +1,52 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +[ "$CC" = cc ] || skip + +# ASAN doesn't work with LD_PRELOAD +nm mold | grep -q '__[at]san_init' && skip + +cat <<'EOF' | $CC -xc -c -o $t/a.o - +#include + +int main() { + printf("Hello\n"); + return 0; +} +EOF + +LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \ + $CC -o $t/exe $t/a.o -B/usr/bin +readelf -p .comment $t/exe > $t/log +grep -q mold $t/log + +./mold -run env | grep -q '^MOLD_PATH=.*/mold$' + +./mold -run /usr/bin/ld --version | grep -q mold +./mold -run /usr/bin/ld.lld --version | grep -q mold +./mold -run /usr/bin/ld.gold --version | grep -q mold + +rm -f $t/ld $t/ld.lld $t/ld.gold $t/foo.ld +touch $t/ld $t/ld.lld $t/ld.gold +echo "#!/bin/sh" >$t/foo.ld +chmod 755 $t/ld $t/ld.lld $t/ld.gold $t/foo.ld + +./mold -run $t/ld --version | grep -q mold +./mold -run $t/ld.lld --version | grep -q mold +./mold -run $t/ld.gold --version | grep -q mold +./mold -run $t/foo.ld --version | grep -q mold && false + +cat <<'EOF' > $t/sh +#!/bin/sh +$1 --version +EOF + +chmod 755 $t/sh + +./mold -run $t/sh ld --version | grep -q mold +./mold -run $t/sh foo.ld --version >& /dev/null | grep -q mold && false + +./mold -run $t/sh $t/ld --version | grep -q mold +./mold -run $t/sh $t/ld.lld --version | grep -q mold +./mold -run $t/sh $t/ld.gold --version | grep -q mold +./mold -run $t/sh $t/foo.ld --version | grep -q mold && false diff --git a/test/elf/section-align.sh b/test/section-align.sh similarity index 100% rename from test/elf/section-align.sh rename to test/section-align.sh diff --git a/test/section-attributes.sh b/test/section-attributes.sh new file mode 100755 index 00000000..63eacf35 --- /dev/null +++ b/test/section-attributes.sh @@ -0,0 +1,24 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat < diff --git a/test/elf/section-start.sh b/test/section-start.sh similarity index 100% rename from test/elf/section-start.sh rename to test/section-start.sh diff --git a/test/separate-debug-file.sh b/test/separate-debug-file.sh new file mode 100755 index 00000000..7430c94e --- /dev/null +++ b/test/separate-debug-file.sh @@ -0,0 +1,28 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +nm mold | grep -q '__tsan_init' && skip +on_qemu && skip +command -v gdb >& /dev/null || skip +command -v flock >& /dev/null || skip + +cat < $t/a.c +#include +int main() { + printf("Hello world\n"); +} +EOF + +$CC -c -o $t/a.o $t/a.c -g +$CC -B. -o $t/exe1 $t/a.o -Wl,--separate-debug-file +readelf -SW $t/exe1 | grep -Fq .gnu_debuglink + +flock $t/exe1 true +gdb $t/exe1 -ex 'list main' -ex 'quit' | grep -Fq printf + +$CC -c -o $t/a.o $t/a.c -g +$CC -B. -o $t/exe2 $t/a.o -Wl,--separate-debug-file -Wl,--no-build-id +readelf -SW $t/exe2 | grep -Fq .gnu_debuglink + +flock $t/exe2 true +gdb $t/exe2 -ex 'list main' -ex 'quit' | grep -Fq printf diff --git a/test/shared-abs-sym.sh b/test/shared-abs-sym.sh new file mode 100755 index 00000000..cc6e0b0b --- /dev/null +++ b/test/shared-abs-sym.sh @@ -0,0 +1,30 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +extern char foo; +int main() { printf("foo=%p\n", &foo); } +EOF + +cp $t/a.so $t/c.so +$CC -B. -o $t/exe1 $t/d.o $t/c.so -pie || skip +$QEMU $t/exe1 | grep -q 'foo=0x3' || skip +cp $t/b.so $t/c.so +$QEMU $t/exe1 | grep -q 'foo=0x5' + +cp $t/a.so $t/c.so +$CC -B. 
-o $t/exe2 $t/d.o $t/c.so -no-pie +$QEMU $t/exe2 | grep -q 'foo=0x3' +cp $t/b.so $t/c.so +$QEMU $t/exe1 | grep -q 'foo=0x5' diff --git a/test/elf/shared.sh b/test/shared.sh similarity index 100% rename from test/elf/shared.sh rename to test/shared.sh diff --git a/test/elf/shuffle-sections-seed.sh b/test/shuffle-sections-seed.sh similarity index 100% rename from test/elf/shuffle-sections-seed.sh rename to test/shuffle-sections-seed.sh diff --git a/test/elf/shuffle-sections.sh b/test/shuffle-sections.sh similarity index 100% rename from test/elf/shuffle-sections.sh rename to test/shuffle-sections.sh diff --git a/test/elf/soname.sh b/test/soname.sh similarity index 100% rename from test/elf/soname.sh rename to test/soname.sh diff --git a/test/spare-program-headers.sh b/test/spare-program-headers.sh new file mode 100755 index 00000000..782925a1 --- /dev/null +++ b/test/spare-program-headers.sh @@ -0,0 +1,25 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +int main() { + printf("Hello world\n"); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o +$QEMU $t/exe1 | grep -q 'Hello world' +[ "$(readelf -Wl $t/exe1 | grep NULL | wc -l)" -eq 0 ] + +$CC -B. -o $t/exe2 $t/a.o -Wl,--spare-program-headers=0 +$QEMU $t/exe2 | grep -q 'Hello world' +[ "$(readelf -Wl $t/exe2 | grep NULL | wc -l)" -eq 0 ] + +$CC -B. -o $t/exe3 $t/a.o -Wl,--spare-program-headers=1 +$QEMU $t/exe3 | grep -q 'Hello world' +[ "$(readelf -Wl $t/exe3 | grep NULL | wc -l)" -eq 1 ] + +$CC -B. 
-o $t/exe4 $t/a.o -Wl,--spare-program-headers=5 +$QEMU $t/exe4 | grep -q 'Hello world' +[ "$(readelf -Wl $t/exe4 | grep NULL | wc -l)" -eq 5 ] diff --git a/test/elf/start-lib.sh b/test/start-lib.sh similarity index 100% rename from test/elf/start-lib.sh rename to test/start-lib.sh diff --git a/test/elf/start-stop-symbol.sh b/test/start-stop-symbol.sh similarity index 100% rename from test/elf/start-stop-symbol.sh rename to test/start-stop-symbol.sh diff --git a/test/elf/start-stop.sh b/test/start-stop.sh similarity index 100% rename from test/elf/start-stop.sh rename to test/start-stop.sh diff --git a/test/elf/static-archive.sh b/test/static-archive.sh similarity index 100% rename from test/elf/static-archive.sh rename to test/static-archive.sh diff --git a/test/elf/static-pie.sh b/test/static-pie.sh similarity index 100% rename from test/elf/static-pie.sh rename to test/static-pie.sh diff --git a/test/elf/stdout.sh b/test/stdout.sh similarity index 100% rename from test/elf/stdout.sh rename to test/stdout.sh diff --git a/test/elf/strip-debug.sh b/test/strip-debug.sh similarity index 100% rename from test/elf/strip-debug.sh rename to test/strip-debug.sh diff --git a/test/elf/strip.sh b/test/strip.sh similarity index 79% rename from test/elf/strip.sh rename to test/strip.sh index f39cdc39..de6b7d1e 100755 --- a/test/elf/strip.sh +++ b/test/strip.sh @@ -15,7 +15,7 @@ grep -Fq _start $t/log grep -Fq foo $t/log grep -Fq bar $t/log -if [ $MACHINE '!=' riscv32 ] && [ $MACHINE '!=' riscv64 ]; then +if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then grep -Fq .L.baz $t/log fi @@ -25,6 +25,6 @@ readelf --symbols $t/exe > $t/log ! grep -Fq foo $t/log || false ! grep -Fq bar $t/log || false -if [ $MACHINE '!=' riscv32 ] && [ $MACHINE '!=' riscv64 ]; then +if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then ! 
grep -Fq .L.baz $t/log || false fi diff --git a/test/stt-common.sh b/test/stt-common.sh new file mode 100755 index 00000000..a7b4ae0c --- /dev/null +++ b/test/stt-common.sh @@ -0,0 +1,26 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < /dev/null || skip +int foo; +int bar; +int baz = 42; +EOF + +cat < + +int foo; +int bar = 5; +int baz; + +int main() { + printf("%d %d %d\n", foo, bar, baz); +} +EOF + +$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,--fatal-warnings +$QEMU $t/exe | grep -q '0 5 42' + +readelf --sections $t/exe > $t/log +grep -q '.common .*NOBITS' $t/log diff --git a/test/elf/symbol-rank.sh b/test/symbol-rank.sh similarity index 100% rename from test/elf/symbol-rank.sh rename to test/symbol-rank.sh diff --git a/test/elf/symbol-version-lto.sh b/test/symbol-version-lto.sh similarity index 92% rename from test/elf/symbol-version-lto.sh rename to test/symbol-version-lto.sh index f8b3f2eb..de02e456 100755 --- a/test/elf/symbol-version-lto.sh +++ b/test/symbol-version-lto.sh @@ -1,6 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc +test_cflags -flto || skip + cat < + +void foo() { printf("foo "); } +void foo2() {} +void foo3() {} + +__asm__(".symver foo2, foo@TEST2"); +__asm__(".symver foo3, foo@TEST3"); +EOF + +cat < $t/b.version +TEST1 { global: foo; }; +TEST2 {}; +TEST3 {}; +EOF + +$CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version + +cat < + +void foo(); + +void bar() { printf("bar "); } +void bar2() { foo(); } +void bar3() {} + +__asm__(".symver bar2, bar@TEST2"); +__asm__(".symver bar3, bar@TEST3"); +EOF + +cat < $t/e.version +TEST1 { global: bar; }; +TEST2 {}; +TEST3 {}; +EOF + +$CC -B. -o $t/f.so -shared $t/d.o $t/c.so -Wl,--version-script=$t/e.version + +cat < + +void foo(); +void bar(); + +int main() { + foo(); + bar(); + printf("\n"); +} +EOF + +$CC -B. 
-o $t/exe $t/g.o $t/f.so $t/c.so +$QEMU $t/exe | grep -q 'foo bar' diff --git a/test/elf/symtab-dso.sh b/test/symtab-dso.sh similarity index 100% rename from test/elf/symtab-dso.sh rename to test/symtab-dso.sh diff --git a/test/elf/symtab-section-symbols.sh b/test/symtab-section-symbols.sh similarity index 100% rename from test/elf/symtab-section-symbols.sh rename to test/symtab-section-symbols.sh diff --git a/test/elf/symtab.sh b/test/symtab.sh similarity index 100% rename from test/elf/symtab.sh rename to test/symtab.sh diff --git a/test/elf/synthetic-symbols.sh b/test/synthetic-symbols.sh similarity index 100% rename from test/elf/synthetic-symbols.sh rename to test/synthetic-symbols.sh diff --git a/test/elf/sysroot-linker-script.sh b/test/sysroot-linker-script.sh similarity index 100% rename from test/elf/sysroot-linker-script.sh rename to test/sysroot-linker-script.sh diff --git a/test/elf/sysroot.sh b/test/sysroot.sh similarity index 100% rename from test/elf/sysroot.sh rename to test/sysroot.sh diff --git a/test/elf/sysroot2.sh b/test/sysroot2.sh similarity index 100% rename from test/elf/sysroot2.sh rename to test/sysroot2.sh diff --git a/test/elf/tail-call.sh b/test/tail-call.sh similarity index 100% rename from test/elf/tail-call.sh rename to test/tail-call.sh diff --git a/test/tbss-only.sh b/test/tbss-only.sh new file mode 100755 index 00000000..a3217800 --- /dev/null +++ b/test/tbss-only.sh @@ -0,0 +1,14 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat < $t/log grep -Eq 'thin-archive/d.a\(.*long-long-long-filename.o\)' $t/log -grep -Eq 'thin-archive/d.a\(.*/b.o\)' $t/log +grep -Eq 'thin-archive/d.a\((.*/)?b.o\)' $t/log grep -Fq thin-archive/d.o $t/log $QEMU $t/exe | grep -q 15 diff --git a/test/elf/thread-count.sh b/test/thread-count.sh similarity index 100% rename from test/elf/thread-count.sh rename to test/thread-count.sh diff --git a/test/elf/tls-alignment-multi.sh b/test/tls-alignment-multi.sh similarity index 100% rename from test/elf/tls-alignment-multi.sh rename to test/tls-alignment-multi.sh diff --git a/test/elf/tls-common.sh b/test/tls-common.sh similarity index 88% rename from test/elf/tls-common.sh rename to test/tls-common.sh index b8ef1464..8f475914 100755 --- a/test/elf/tls-common.sh +++ b/test/tls-common.sh @@ -17,4 +17,5 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o +readelf -WS $t/exe | grep -Fq .tls_common $QEMU $t/exe | grep -q '^foo=0$' diff --git a/test/elf/tls-df-static-tls.sh b/test/tls-df-static-tls.sh similarity index 100% rename from test/elf/tls-df-static-tls.sh rename to test/tls-df-static-tls.sh diff --git a/test/elf/tls-dso.sh b/test/tls-dso.sh similarity index 100% rename from test/elf/tls-dso.sh rename to test/tls-dso.sh diff --git a/test/elf/tls-gd-dlopen.sh b/test/tls-gd-dlopen.sh similarity index 100% rename from test/elf/tls-gd-dlopen.sh rename to test/tls-gd-dlopen.sh diff --git a/test/elf/tls-gd-noplt.sh b/test/tls-gd-noplt.sh similarity index 99% rename from test/elf/tls-gd-noplt.sh rename to test/tls-gd-noplt.sh index 627bdf07..5dfb74bc 100755 --- a/test/elf/tls-gd-noplt.sh +++ b/test/tls-gd-noplt.sh @@ -25,7 +25,6 @@ __attribute__((tls_model("global-dynamic"))) static _Thread_local int x5 = 5; int get_x5() { return x5; } EOF - cat < $t/log1 -! grep -Eq 'TLS.?DESC' $t/log1 || false +$OBJDUMP --dynamic-reloc $t/exe1 > $t/log1 +! grep -Eq 'TLS_?DESC' $t/log1 || false -$CC -B. 
-o $t/exe1 $t/c.o $t/d.o $t/b.so -Wl,--no-relax -$QEMU $t/exe1 | grep -q '^5 5 5$' +$CC -B. -o $t/exe2 $t/c.o $t/d.o $t/b.so -Wl,--no-relax +$QEMU $t/exe2 | grep -q '^5 5 5$' -readelf -Wr $t/exe1 > $t/log2 -grep -Eq 'TLS.?DESC' $t/log2 +$OBJDUMP --dynamic-reloc $t/exe2 > $t/log2 +grep -Eq 'TLS_?DESC' $t/log2 diff --git a/test/elf/tlsdesc-local-dynamic.sh b/test/tlsdesc-local-dynamic.sh similarity index 100% rename from test/elf/tlsdesc-local-dynamic.sh rename to test/tlsdesc-local-dynamic.sh diff --git a/test/elf/tlsdesc-static.sh b/test/tlsdesc-static.sh similarity index 100% rename from test/elf/tlsdesc-static.sh rename to test/tlsdesc-static.sh diff --git a/test/elf/tlsdesc.sh b/test/tlsdesc.sh similarity index 100% rename from test/elf/tlsdesc.sh rename to test/tlsdesc.sh diff --git a/test/trace-symbol-symver.sh b/test/trace-symbol-symver.sh new file mode 100755 index 00000000..b325ac20 --- /dev/null +++ b/test/trace-symbol-symver.sh @@ -0,0 +1,30 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < $t/b.version +VER1 { local: *; }; +VER2 { local: *; }; +VER3 { local: *; }; +EOF + +$CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version \ + -Wl,--trace-symbol='foo@VER1' > /dev/null + +cat < /dev/null +$QEMU $t/exe diff --git a/test/elf/trace-symbol.sh b/test/trace-symbol.sh similarity index 100% rename from test/elf/trace-symbol.sh rename to test/trace-symbol.sh diff --git a/test/elf/trace.sh b/test/trace.sh similarity index 100% rename from test/elf/trace.sh rename to test/trace.sh diff --git a/test/undefined-glob-gc-sections.sh b/test/undefined-glob-gc-sections.sh new file mode 100755 index 00000000..e18baec2 --- /dev/null +++ b/test/undefined-glob-gc-sections.sh @@ -0,0 +1,29 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +test_cflags -static || skip + +cat < $t/log2 +grep -q foo $t/log2 +grep -q foobar $t/log2 +! 
grep -q baz $t/log2 || false diff --git a/test/undefined-glob.sh b/test/undefined-glob.sh new file mode 100755 index 00000000..ac5c775b --- /dev/null +++ b/test/undefined-glob.sh @@ -0,0 +1,35 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +test_cflags -static || skip + +cat < $t/log1 +! grep -q foo $t/log1 || false +! grep -q foobar $t/log1 || false +! grep -q baz $t/log1 || false + +$CC -B. -o $t/exe2 $t/d.a $t/e.o -Wl,--undefined-glob='foo*' +readelf -W --symbols $t/exe2 > $t/log2 +grep -q foo $t/log2 +grep -q foobar $t/log2 +! grep -q baz $t/log2 || false diff --git a/test/elf/undefined.sh b/test/undefined.sh similarity index 100% rename from test/elf/undefined.sh rename to test/undefined.sh diff --git a/test/elf/undefined2.sh b/test/undefined2.sh similarity index 100% rename from test/elf/undefined2.sh rename to test/undefined2.sh diff --git a/test/unkown-section-type.sh b/test/unkown-section-type.sh new file mode 100755 index 00000000..52c34022 --- /dev/null +++ b/test/unkown-section-type.sh @@ -0,0 +1,9 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < /dev/null || skip +.section .my_section,"a",@0x80000000 +EOF + +! $CC -B. -o $t/exe $t/a.o >& $t/log1 +grep -q 'unsupported section type: 0x80000000' $t/log1 diff --git a/test/elf/unresolved-symbols.sh b/test/unresolved-symbols.sh similarity index 100% rename from test/elf/unresolved-symbols.sh rename to test/unresolved-symbols.sh diff --git a/test/unresolved-symbols2.sh b/test/unresolved-symbols2.sh new file mode 100755 index 00000000..566f5065 --- /dev/null +++ b/test/unresolved-symbols2.sh @@ -0,0 +1,10 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < $t/a.ver +VER1 { foo*; }; +VER2 { foo_x; }; +EOF + +cat < $t/log +grep -Fq 'foo_x@@VER2' $t/log +grep -Fq 'foo_y@@VER1' $t/log +grep -Fq 'foo_z@@VER1' $t/log diff --git a/test/version-script21.sh b/test/version-script21.sh new file mode 100755 index 00000000..3d75b762 --- /dev/null +++ b/test/version-script21.sh @@ -0,0 +1,19 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat <<'EOF' > $t/a.ver +VER1 { foo_x; }; +VER2 { foo*; }; +EOF + +cat < $t/log +grep -Fq 'foo_x@@VER1' $t/log +grep -Fq 'foo_y@@VER2' $t/log +grep -Fq 'foo_z@@VER2' $t/log diff --git a/test/version-script22.sh b/test/version-script22.sh new file mode 100755 index 00000000..1b17b4d8 --- /dev/null +++ b/test/version-script22.sh @@ -0,0 +1,15 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <<'EOF' > $t/a.ver +VER1 { foo*; }; +VER2 { foo*bar*; }; +EOF + +cat < $t/log +grep -Fq 'foo_bar@@VER2' $t/log diff --git a/test/version-script23.sh b/test/version-script23.sh new file mode 100755 index 00000000..3e1dd29d --- /dev/null +++ b/test/version-script23.sh @@ -0,0 +1,15 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <<'EOF' > $t/a.ver +VER1 { foo\?; }; +EOF + +cat <& /dev/null || skip +.globl "foo?" +"foo?": +EOF + +$CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o +readelf -W --dyn-syms $t/c.so > $t/log +grep -Fq 'foo?@@VER1' $t/log diff --git a/test/elf/version-script3.sh b/test/version-script3.sh similarity index 100% rename from test/elf/version-script3.sh rename to test/version-script3.sh diff --git a/test/elf/version-script4.sh b/test/version-script4.sh similarity index 100% rename from test/elf/version-script4.sh rename to test/version-script4.sh diff --git a/test/elf/version-script5.sh b/test/version-script5.sh similarity index 100% rename from test/elf/version-script5.sh rename to test/version-script5.sh diff --git a/test/elf/version-script6.sh b/test/version-script6.sh similarity index 100% rename from test/elf/version-script6.sh rename to test/version-script6.sh diff --git a/test/elf/version-script7.sh b/test/version-script7.sh similarity index 100% rename from test/elf/version-script7.sh rename to test/version-script7.sh diff --git a/test/elf/version-script8.sh b/test/version-script8.sh similarity index 100% rename from test/elf/version-script8.sh rename to test/version-script8.sh diff --git 
a/test/elf/version-script9.sh b/test/version-script9.sh similarity index 100% rename from test/elf/version-script9.sh rename to test/version-script9.sh diff --git a/test/elf/version.sh b/test/version.sh similarity index 58% rename from test/elf/version.sh rename to test/version.sh index 51a95bec..9fba3660 100755 --- a/test/elf/version.sh +++ b/test/version.sh @@ -4,10 +4,10 @@ # OneTBB isn't tsan-clean nm mold | grep -q '__tsan_init' && skip -./mold -v | grep -q '[ms]old .*compatible with GNU ld' -./mold --version | grep -q '[ms]old .*compatible with GNU ld' +./mold -v | grep -q 'mold .*compatible with GNU ld' +./mold --version | grep -q 'mold .*compatible with GNU ld' -./mold -V | grep -q '[ms]old .*compatible with GNU ld' +./mold -V | grep -q 'mold .*compatible with GNU ld' ./mold -V | grep -q elf_x86_64 ./mold -V | grep -q elf_i386 @@ -20,10 +20,10 @@ int main() { EOF rm -f $t/exe -$CC -B. -Wl,--version -o $t/exe1 $t/a.o 2>&1 | grep -q '[ms]old' +$CC -B. -Wl,--version -o $t/exe1 $t/a.o 2>&1 | grep -q mold ! [ -f $t/exe1 ] || false -$CC -B. -Wl,-v -o $t/exe2 $t/a.o 2>&1 | grep -q '[ms]old' +$CC -B. -Wl,-v -o $t/exe2 $t/a.o 2>&1 | grep -q mold $QEMU $t/exe2 | grep -q 'Hello world' ! ./mold --v >& $t/log diff --git a/test/elf/versioned-undef.sh b/test/versioned-undef.sh similarity index 100% rename from test/elf/versioned-undef.sh rename to test/versioned-undef.sh diff --git a/test/elf/visibility.sh b/test/visibility.sh similarity index 100% rename from test/elf/visibility.sh rename to test/visibility.sh diff --git a/test/elf/warn-common.sh b/test/warn-common.sh similarity index 100% rename from test/elf/warn-common.sh rename to test/warn-common.sh diff --git a/test/elf/warn-once.sh b/test/warn-once.sh similarity index 83% rename from test/elf/warn-once.sh rename to test/warn-once.sh index 44ab16a3..852fe0e2 100755 --- a/test/elf/warn-once.sh +++ b/test/warn-once.sh @@ -14,4 +14,4 @@ EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o -Wl,--warn-unresolved-symbols,--warn-once >& $t/log -[ "$(grep 'undefined symbol:.* foo$' $t/log | wc -l)" = 1 ] +[ $(grep 'undefined symbol:.* foo$' $t/log | wc -l) = 1 ] diff --git a/test/elf/warn-symbol-type.sh b/test/warn-symbol-type.sh similarity index 100% rename from test/elf/warn-symbol-type.sh rename to test/warn-symbol-type.sh diff --git a/test/elf/warn-unresolved-symbols.sh b/test/warn-unresolved-symbols.sh similarity index 100% rename from test/elf/warn-unresolved-symbols.sh rename to test/warn-unresolved-symbols.sh diff --git a/test/elf/weak-export-dso.sh b/test/weak-export-dso.sh similarity index 100% rename from test/elf/weak-export-dso.sh rename to test/weak-export-dso.sh diff --git a/test/weak-export-dso2.sh b/test/weak-export-dso2.sh new file mode 100755 index 00000000..5b1dee8e --- /dev/null +++ b/test/weak-export-dso2.sh @@ -0,0 +1,21 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +__attribute__((weak)) int foo(); + +int main() { + printf("%d\n", foo ? foo() : 3); +} +EOF + +$CC -B. 
-o $t/d.so $t/c.o $t/b.so -shared +readelf -W --dyn-syms $t/d.so | grep -q 'WEAK DEFAULT .* UND foo' diff --git a/test/elf/weak-export-exe.sh b/test/weak-export-exe.sh similarity index 100% rename from test/elf/weak-export-exe.sh rename to test/weak-export-exe.sh diff --git a/test/elf/weak-undef-dso.sh b/test/weak-undef-dso.sh similarity index 100% rename from test/elf/weak-undef-dso.sh rename to test/weak-undef-dso.sh diff --git a/test/elf/weak-undef.sh b/test/weak-undef.sh similarity index 100% rename from test/elf/weak-undef.sh rename to test/weak-undef.sh diff --git a/test/elf/weak-undef2.sh b/test/weak-undef2.sh similarity index 100% rename from test/elf/weak-undef2.sh rename to test/weak-undef2.sh diff --git a/test/elf/weak-undef4.sh b/test/weak-undef4.sh similarity index 100% rename from test/elf/weak-undef4.sh rename to test/weak-undef4.sh diff --git a/test/weak-undef5.sh b/test/weak-undef5.sh new file mode 100755 index 00000000..887c8e93 --- /dev/null +++ b/test/weak-undef5.sh @@ -0,0 +1,21 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +__attribute__((weak)) int foo(); +int main() { + printf("%d\n", foo ? foo() : -1); +} +EOF + +cat < +int foo() { return 2; } +EOF + +$CC -B. -o $t/libfoobar.so $t/b.o -shared +$CC -B. -o $t/exe $t/a.o -Wl,--as-needed -L$t -lfoobar -Wl,-rpath,$t + +readelf --dynamic $t/exe | grep -q 'NEEDED.*libfoobar' +$QEMU $t/exe | grep -q '^2$' diff --git a/test/elf/whole-archive.sh b/test/whole-archive.sh similarity index 61% rename from test/elf/whole-archive.sh rename to test/whole-archive.sh index de5da115..721acf37 100755 --- a/test/elf/whole-archive.sh +++ b/test/whole-archive.sh @@ -14,19 +14,19 @@ ar cr $t/d.a $t/b.o $t/c.o $CC -B. -nostdlib -o $t/exe $t/a.o $t/d.a -readelf --symbols $t/exe > $t/readelf -! grep -q fn1 $t/readelf || false -! grep -q fn2 $t/readelf || false +readelf --symbols $t/exe > $t/log +! grep -q fn1 $t/log || false +! grep -q fn2 $t/log || false $CC -B. 
-nostdlib -o $t/exe $t/a.o -Wl,--whole-archive $t/d.a -readelf --symbols $t/exe > $t/readelf -grep -q fn1 $t/readelf -grep -q fn2 $t/readelf +readelf --symbols $t/exe > $t/log +grep -q fn1 $t/log +grep -q fn2 $t/log $CC -B. -nostdlib -o $t/exe $t/a.o -Wl,--whole-archive \ -Wl,--no-whole-archive $t/d.a -readelf --symbols $t/exe > $t/readelf -! grep -q fn1 $t/readelf || false -! grep -q fn2 $t/readelf || false +readelf --symbols $t/exe > $t/log +! grep -q fn1 $t/log || false +! grep -q fn2 $t/log || false diff --git a/test/elf/wrap-lto.sh b/test/wrap-lto.sh similarity index 96% rename from test/elf/wrap-lto.sh rename to test/wrap-lto.sh index 1e26af8c..0e2fb52b 100755 --- a/test/elf/wrap-lto.sh +++ b/test/wrap-lto.sh @@ -1,6 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc +test_cflags -flto || skip + cat < diff --git a/test/elf/wrap.sh b/test/wrap.sh similarity index 100% rename from test/elf/wrap.sh rename to test/wrap.sh diff --git a/test/elf/z-cet-report.sh b/test/z-cet-report.sh similarity index 100% rename from test/elf/z-cet-report.sh rename to test/z-cet-report.sh diff --git a/test/elf/z-defs.sh b/test/z-defs.sh similarity index 100% rename from test/elf/z-defs.sh rename to test/z-defs.sh diff --git a/test/elf/z-dynamic-undefined-weak.sh b/test/z-dynamic-undefined-weak.sh similarity index 100% rename from test/elf/z-dynamic-undefined-weak.sh rename to test/z-dynamic-undefined-weak.sh diff --git a/test/elf/z-max-page-size.sh b/test/z-max-page-size.sh similarity index 100% rename from test/elf/z-max-page-size.sh rename to test/z-max-page-size.sh diff --git a/test/elf/z-nodefaultlib.sh b/test/z-nodefaultlib.sh similarity index 100% rename from test/elf/z-nodefaultlib.sh rename to test/z-nodefaultlib.sh diff --git a/test/elf/z-nodump.sh b/test/z-nodump.sh similarity index 100% rename from test/elf/z-nodump.sh rename to test/z-nodump.sh diff --git a/test/elf/z-now.sh b/test/z-now.sh similarity index 100% rename from test/elf/z-now.sh rename to test/z-now.sh diff 
--git a/test/elf/z-origin.sh b/test/z-origin.sh similarity index 100% rename from test/elf/z-origin.sh rename to test/z-origin.sh diff --git a/test/z-pack-relative-relocs.sh b/test/z-pack-relative-relocs.sh new file mode 100755 index 00000000..357bb859 --- /dev/null +++ b/test/z-pack-relative-relocs.sh @@ -0,0 +1,21 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +int main() { + printf("Hello world\n"); +} +EOF + +$CC -o $t/exe1 $t/a.o -pie -Wl,-z,pack-relative-relocs 2> /dev/null || skip +readelf -WS $t/exe1 | grep -Fq .relr.dyn || skip +$QEMU $t/exe1 2> /dev/null | grep -q Hello || skip + +$CC -B. -o $t/exe2 $t/a.o -pie -Wl,-z,pack-relative-relocs +$QEMU $t/exe2 | grep -q Hello + +readelf --dynamic $t/exe2 > $t/log2 +grep -wq RELR $t/log2 +grep -wq RELRSZ $t/log2 +grep -wq RELRENT $t/log2 diff --git a/test/z-rodynamic.sh b/test/z-rodynamic.sh new file mode 100755 index 00000000..44013dad --- /dev/null +++ b/test/z-rodynamic.sh @@ -0,0 +1,12 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +__attribute__((section("hello"))) +static char msg[] = "Hello world"; + +int main() { + printf("%s\n", msg); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o +readelf -W --dyn-syms $t/exe1 > $t/log1 +! grep -q __start_hello $t/log1 || false +! grep -q __stop_hello $t/log1 || false + +$CC -B. -o $t/exe2 $t/a.o -Wl,-z,start-stop-visibility=hidden +readelf -W --dyn-syms $t/exe2 > $t/log2 +! grep -q __start_hello $t/log2 || false +! grep -q __stop_hello $t/log2 || false + +$CC -B. 
-o $t/exe3 $t/a.o -Wl,-z,start-stop-visibility=protected +readelf -W --dyn-syms $t/exe3 > $t/log3 +grep -q __start_hello $t/log3 +grep -q __stop_hello $t/log3 diff --git a/test/elf/z-unknown.sh b/test/z-unknown.sh similarity index 100% rename from test/elf/z-unknown.sh rename to test/z-unknown.sh diff --git a/third-party/blake3/.git-blame-ignore-revs b/third-party/blake3/.git-blame-ignore-revs new file mode 100644 index 00000000..6e814e69 --- /dev/null +++ b/third-party/blake3/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# CMakeLists.txt whitespace fixups +3e14f865d30271c74fc68d417af488ea91b66d48 diff --git a/third-party/blake3/.github/workflows/ci.yml b/third-party/blake3/.github/workflows/ci.yml index c1a88aaf..e93ecb38 100644 --- a/third-party/blake3/.github/workflows/ci.yml +++ b/third-party/blake3/.github/workflows/ci.yml @@ -38,12 +38,10 @@ jobs: ] steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Print the compiler version, for debugging. - name: print compiler version run: cargo run --quiet @@ -52,13 +50,17 @@ jobs: - name: print instruction set support run: cargo run --quiet working-directory: ./tools/instruction_set_support - # Default tests plus Rayon and RustCrypto trait implementations. - - run: cargo test --features=rayon,traits-preview + # Default tests plus Rayon and trait implementations. + - run: cargo test --features=rayon,traits-preview,serde,zeroize # Same but with only one thread in the Rayon pool. This can find deadlocks. - name: "again with RAYON_NUM_THREADS=1" - run: cargo test --features=rayon,traits-preview + run: cargo test --features=rayon,traits-preview,serde,zeroize env: RAYON_NUM_THREADS: 1 + # The mmap feature by itself (update_mmap_rayon is omitted). + - run: cargo test --features=mmap + # All public features put together. 
+ - run: cargo test --features=mmap,rayon,traits-preview,serde,zeroize # no_std tests. - run: cargo test --no-default-features @@ -129,6 +131,17 @@ jobs: run: cargo test working-directory: ./reference_impl + # the new guts crate + - name: guts test + run: cargo test --all-features + working-directory: ./rust/guts + - name: guts no_std build + run: cargo build --no-default-features + working-directory: ./rust/guts + - name: guts no_std test # note that rust/guts/src/test.rs still uses libstd + run: cargo test --no-default-features + working-directory: ./rust/guts + b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} @@ -148,16 +161,14 @@ jobs: # The b3sum MSRV is sometimes higher than the blake3 crate's, because # b3sum depends on Clap. We check in the b3sum Cargo.lock, so Clap # update shouldn't randomly break us here. - "1.66.1", + "1.74.1", ] steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Test b3sum. - name: test b3sum run: cargo test @@ -177,14 +188,13 @@ jobs: - i686-unknown-linux-musl - armv7-unknown-linux-gnueabihf - aarch64-unknown-linux-gnu - - mips-unknown-linux-gnu + # Big-endian targets. See https://twitter.com/burntsushi5/status/1695483429997945092. + - powerpc64-unknown-linux-gnu + - s390x-unknown-linux-gnu steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable - run: cargo install cross # Test the portable implementation on everything. - run: cross test --target ${{ matrix.arch }} @@ -210,7 +220,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Test the intrinsics-based implementations. 
- run: make -f Makefile.testing test working-directory: ./c @@ -262,12 +272,10 @@ jobs: strategy: fail-fast: false steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - target: aarch64-apple-darwin - override: true + targets: aarch64-apple-darwin - name: build blake3 run: cargo build --target aarch64-apple-darwin - name: build b3sum @@ -278,7 +286,7 @@ jobs: name: build with the Tiny C Compiler runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: install TCC run: sudo apt-get install -y tcc - name: compile @@ -295,7 +303,7 @@ jobs: name: "compile and test with GCC 5.4" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: addnab/docker-run-action@v3 with: image: gcc:5.4 @@ -308,7 +316,7 @@ jobs: # CMake build test (Library only), current macOS/Linux only. cmake_build: - name: CMake ${{ matrix.os }} + name: CMake ${{ matrix.os }} ${{ matrix.compiler }} runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -323,8 +331,21 @@ jobs: - os: macOS-latest compiler: msvc steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: CMake generation run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target - name: CMake build / install run: cmake --build c/build --target install + + miri_smoketest: + name: Miri smoketest + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + with: + components: miri + # Currently the test search "miri" only matches "test_miri_smoketest", but + # we might add more. If this accidentally picks up anything incompatible or + # slow, we can narrow it. 
+ - run: cargo miri test miri diff --git a/third-party/blake3/.github/workflows/tag.yml b/third-party/blake3/.github/workflows/tag.yml index 3f7e886b..61be4ff9 100644 --- a/third-party/blake3/.github/workflows/tag.yml +++ b/third-party/blake3/.github/workflows/tag.yml @@ -23,18 +23,16 @@ jobs: ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.x" - run: pip install PyGithub - run: sudo apt-get install musl-tools if: matrix.target.os == 'ubuntu-latest' - - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - profile: minimal - - run: rustup target add ${{ matrix.target.rust-target }} + targets: ${{ matrix.target.rust-target }} - name: build b3sum id: build_b3sum run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} diff --git a/third-party/blake3/Cargo.toml b/third-party/blake3/Cargo.toml index 8df13874..55eb8a41 100644 --- a/third-party/blake3/Cargo.toml +++ b/third-party/blake3/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "blake3" -version = "1.4.1" +version = "1.5.1" authors = ["Jack O'Connor ", "Samuel Neves"] description = "the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" @@ -23,11 +23,21 @@ neon = [] # --no-default-features, the only way to use the SIMD implementations in this # crate is to enable the corresponding instruction sets statically for the # entire build, with e.g. RUSTFLAGS="-C target-cpu=native". -std = ["digest/std"] +std = [] -# The "rayon" feature (defined below as an optional dependency) enables the -# `Hasher::update_rayon` method, for multithreaded hashing. However, even if -# this feature is enabled, all other APIs remain single-threaded. +# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the +# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` +# methods, for multithreaded hashing. 
However, even if this feature is enabled, +# all other APIs remain single-threaded. +rayon = ["dep:rayon", "std"] + +# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the +# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` +# helper methods for memory-mapped IO. +mmap = ["std", "dep:memmap2"] + +# Implement the zeroize::Zeroize trait for types in this crate. +zeroize = ["dep:zeroize", "arrayvec/zeroize"] # This crate implements traits from the RustCrypto project, exposed here as the # "traits-preview" feature. However, these traits aren't stable, and they're @@ -78,24 +88,29 @@ no_avx512 = [] no_neon = [] [package.metadata.docs.rs] -# Document Hasher::update_rayon on docs.rs. -features = ["rayon"] +# Document the rayon/mmap methods and the Serialize/Deserialize/Zeroize impls on docs.rs. +features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] arrayref = "0.3.5" -arrayvec = { version = "0.7.0", default-features = false } +arrayvec = { version = "0.7.4", default-features = false } constant_time_eq = "0.3.0" -rayon = { version = "1.2.1", optional = true } cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } +memmap2 = { version = "0.9", optional = true } +rayon = { version = "1.2.1", optional = true } +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true } [dev-dependencies] +hmac = "0.12.0" hex = "0.4.2" -page_size = "0.5.0" +page_size = "0.6.0" rand = "0.8.0" rand_chacha = "0.3.0" reference_impl = { path = "./reference_impl" } -hmac = "0.12.0" +tempfile = "3.8.0" +serde_json = "1.0.107" [build-dependencies] cc = "1.0.4" diff --git a/third-party/blake3/README.md b/third-party/blake3/README.md index a63d5f2c..6b493775 100644 --- a/third-party/blake3/README.md +++ b/third-party/blake3/README.md @@ -201,6 +201,7 @@ Alternatively, it is 
licensed under the Apache License 2.0. Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala) +* [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0) * [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) * [IPFS](https://github.com/ipfs/go-verifcid/issues/13) * [Farcaster](https://www.farcaster.xyz/) @@ -211,6 +212,7 @@ Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Saito](https://saito.tech/) * [Skale](https://github.com/skalenetwork/skale-consensus/pull/284) * [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html) +* [Tekken 8](https://en.bandainamcoent.eu/tekken/tekken-8) * [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21) diff --git a/third-party/blake3/b3sum/Cargo.lock b/third-party/blake3/b3sum/Cargo.lock index 2a599a85..2300d3bf 100644 --- a/third-party/blake3/b3sum/Cargo.lock +++ b/third-party/blake3/b3sum/Cargo.lock @@ -4,58 +4,57 @@ version = 3 [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.1" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "anstyle-parse" -version = "0.2.1" +version = "0.2.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anyhow" -version = "1.0.71" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "arrayref" @@ -69,22 +68,15 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - [[package]] name = "b3sum" -version = "1.4.1" +version = "1.5.1" dependencies = [ "anyhow", "blake3", "clap", "duct", "hex", - "memmap2", "rayon", "tempfile", "wild", @@ -92,43 +84,28 @@ dependencies = [ [[package]] name = "bitflags" -version = "1.3.2" +version = "2.4.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "blake3" -version = "1.4.1" +version = "1.5.1" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "digest", + "memmap2", "rayon", ] -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "cc" -version = "1.0.79" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" [[package]] name = "cfg-if" @@ -138,20 +115,19 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.3.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" +checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstream", "anstyle", @@ -162,9 +138,9 @@ dependencies = [ [[package]] name = 
"clap_derive" -version = "4.3.2" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", "proc-macro2", @@ -174,9 +150,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "colorchoice" @@ -190,75 +166,36 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" -[[package]] -name = "crossbeam-channel" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "duct" -version = "0.13.6" +version = "0.13.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ae3fc31835f74c2a7ceda3aeede378b0ae2e74c8f1c36559fcc9ae2a4e7d3e" +checksum = "e4ab5718d1224b63252cd0c6f74f6480f9ffeb117438a2e0f5cf6d9a4798929c" dependencies = [ "libc", "once_cell", @@ -268,49 +205,25 @@ dependencies = [ [[package]] name = "either" -version = "1.8.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" [[package]] name = "errno" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" 
dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] name = "fastrand" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] - -[[package]] -name = "generic-array" -version = "0.14.7" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "glob" @@ -324,134 +237,72 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" -[[package]] -name = "hermit-abi" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" - [[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys", -] - -[[package]] -name = "is-terminal" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" -dependencies = [ - "hermit-abi", - "rustix 0.38.3", - 
"windows-sys", -] - [[package]] name = "libc" -version = "0.2.147" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" - -[[package]] -name = "linux-raw-sys" -version = "0.3.8" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.4.3" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "memmap2" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" -dependencies = [ - "libc", -] - -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.16.0" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ - "hermit-abi", "libc", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "os_pipe" -version = "1.1.4" +version = "1.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177" +checksum = "57119c3b893986491ec9aa85056780d3a0f3cf4da7cc09dd3650dbd6c6738fb9" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "proc-macro2" -version = "1.0.63" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.29" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" dependencies = [ "either", "rayon-core", @@ -459,58 +310,27 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "num_cpus", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", ] [[package]] name = "rustix" -version = "0.37.23" +version = 
"0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 1.3.2", + "bitflags", "errno", - "io-lifetimes", "libc", - "linux-raw-sys 0.3.8", - "windows-sys", + "linux-raw-sys", + "windows-sys 0.52.0", ] -[[package]] -name = "rustix" -version = "0.38.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4" -dependencies = [ - "bitflags 2.3.3", - "errno", - "libc", - "linux-raw-sys 0.4.3", - "windows-sys", -] - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - [[package]] name = "shared_child" version = "1.0.0" @@ -523,21 +343,15 @@ dependencies = [ [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "subtle" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" -version = "2.0.23" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -546,39 +360,31 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.6.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ - "autocfg", "cfg-if", "fastrand", - "redox_syscall", - "rustix 0.37.23", - "windows-sys", + "rustix", + "windows-sys 0.52.0", ] [[package]] name = "terminal_size" -version = "0.2.6" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ - "rustix 0.37.23", - "windows-sys", + "rustix", + "windows-sys 0.48.0", ] -[[package]] -name = "typenum" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - [[package]] name = "unicode-ident" -version = "1.0.10" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "utf8parse" @@ -586,17 +392,11 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - [[package]] name = "wild" -version = "2.1.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b116685a6be0c52f5a103334cbff26db643826c7b3735fc0a3ba9871310a74" +checksum = "a3131afc8c575281e1e80f36ed6a092aa502c08b18ed7524e86fbbb12bb410e1" dependencies = [ "glob", ] @@ -629,62 +429,128 @@ version = "0.48.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = 
"0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/third-party/blake3/b3sum/Cargo.toml b/third-party/blake3/b3sum/Cargo.toml index 02c9405f..812ed224 100644 --- a/third-party/blake3/b3sum/Cargo.toml +++ b/third-party/blake3/b3sum/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "b3sum" -version = "1.4.1" +version = "1.5.1" authors = ["Jack O'Connor "] description = "a command line implementation of the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" @@ -15,10 +15,9 @@ pure = ["blake3/pure"] [dependencies] anyhow = "1.0.25" -blake3 = { version = "1", path = "..", features = ["rayon"] } +blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" -memmap2 = "0.7.0" rayon 
= "1.2.1" wild = "2.0.3" diff --git a/third-party/blake3/b3sum/src/main.rs b/third-party/blake3/b3sum/src/main.rs index fd35f686..228737ff 100644 --- a/third-party/blake3/b3sum/src/main.rs +++ b/third-party/blake3/b3sum/src/main.rs @@ -163,125 +163,22 @@ impl Args { } } -enum Input { - Mmap(io::Cursor), - File(File), - Stdin, -} - -impl Input { - // Open an input file, using mmap if appropriate. "-" means stdin. Note - // that this convention applies both to command line arguments, and to - // filepaths that appear in a checkfile. - fn open(path: &Path, args: &Args) -> Result { - if path == Path::new("-") { - if args.keyed() { - bail!("Cannot open `-` in keyed mode"); - } - return Ok(Self::Stdin); - } - let file = File::open(path)?; - if !args.no_mmap() { - if let Some(mmap) = maybe_memmap_file(&file)? { - return Ok(Self::Mmap(io::Cursor::new(mmap))); - } - } - Ok(Self::File(file)) - } - - fn hash(&mut self, args: &Args) -> Result { - let mut hasher = args.base_hasher.clone(); - match self { - // The fast path: If we mmapped the file successfully, hash using - // multiple threads. This doesn't work on stdin, or on some files, - // and it can also be disabled with --no-mmap. - Self::Mmap(cursor) => { - hasher.update_rayon(cursor.get_ref()); - } - // The slower paths, for stdin or files we didn't/couldn't mmap. - // This is currently all single-threaded. Doing multi-threaded - // hashing without memory mapping is tricky, since all your worker - // threads have to stop every time you refill the buffer, and that - // ends up being a lot of overhead. To solve that, we need a more - // complicated double-buffering strategy where a background thread - // fills one buffer while the worker threads are hashing the other - // one. We might implement that in the future, but since this is - // the slow path anyway, it's not high priority. 
- Self::File(file) => { - copy_wide(file, &mut hasher)?; - } - Self::Stdin => { - let stdin = io::stdin(); - let lock = stdin.lock(); - copy_wide(lock, &mut hasher)?; - } - } - let mut output_reader = hasher.finalize_xof(); - output_reader.set_position(args.seek()); - Ok(output_reader) - } -} - -impl Read for Input { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Mmap(cursor) => cursor.read(buf), - Self::File(file) => file.read(buf), - Self::Stdin => io::stdin().read(buf), +fn hash_path(args: &Args, path: &Path) -> Result { + let mut hasher = args.base_hasher.clone(); + if path == Path::new("-") { + if args.keyed() { + bail!("Cannot open `-` in keyed mode"); } - } -} - -// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets -// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms -// can support at least 64 KiB, and there's some performance benefit to using -// bigger reads, so that's what we use here. -fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result { - let mut buffer = [0; 65536]; - let mut total = 0; - loop { - match reader.read(&mut buffer) { - Ok(0) => return Ok(total), - Ok(n) => { - hasher.update(&buffer[..n]); - total += n as u64; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => return Err(e), - } - } -} - -// Mmap a file, if it looks like a good idea. Return None in cases where we -// know mmap will fail, or if the file is short enough that mmapping isn't -// worth it. However, if we do try to mmap and it fails, return the error. -fn maybe_memmap_file(file: &File) -> Result> { - let metadata = file.metadata()?; - let file_size = metadata.len(); - Ok(if !metadata.is_file() { - // Not a real file. - None - } else if file_size > isize::max_value() as u64 { - // Too long to safely map. - // https://github.com/danburkert/memmap-rs/issues/69 - None - } else if file_size == 0 { - // Mapping an empty file currently fails. 
- // https://github.com/danburkert/memmap-rs/issues/72 - None - } else if file_size < 16 * 1024 { - // Mapping small files is not worth it. - None + hasher.update_reader(io::stdin().lock())?; + } else if args.no_mmap() { + hasher.update_reader(File::open(path)?)?; } else { - // Explicitly set the length of the memory map, so that filesystem - // changes can't race to violate the invariants we just checked. - let map = unsafe { - memmap2::MmapOptions::new() - .len(file_size as usize) - .map(file)? - }; - Some(map) - }) + // The fast path: Try to mmap the file and hash it with multiple threads. + hasher.update_mmap_rayon(path)?; + } + let mut output_reader = hasher.finalize_xof(); + output_reader.set_position(args.seek()); + Ok(output_reader) } fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { @@ -477,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result { } fn hash_one_input(path: &Path, args: &Args) -> Result<()> { - let mut input = Input::open(path, args)?; - let output = input.hash(args)?; + let output = hash_path(args, path)?; if args.raw() { write_raw_output(output, args)?; return Ok(()); @@ -522,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool { } else { file_string }; - let hash_result: Result = Input::open(&file_path, args) - .and_then(|mut input| input.hash(args)) - .map(|mut hash_output| { + let found_hash: blake3::Hash; + match hash_path(args, &file_path) { + Ok(mut output) => { let mut found_hash_bytes = [0; blake3::OUT_LEN]; - hash_output.fill(&mut found_hash_bytes); - found_hash_bytes.into() - }); - let found_hash: blake3::Hash = match hash_result { - Ok(hash) => hash, + output.fill(&mut found_hash_bytes); + found_hash = found_hash_bytes.into(); + } Err(e) => { println!("{}: FAILED ({})", file_string, e); return false; @@ -549,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool { } fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> { - let 
checkfile_input = Input::open(path, args)?; - let mut bufreader = io::BufReader::new(checkfile_input); + let mut file; + let stdin; + let mut stdin_lock; + let mut bufreader: io::BufReader<&mut dyn Read>; + if path == Path::new("-") { + stdin = io::stdin(); + stdin_lock = stdin.lock(); + bufreader = io::BufReader::new(&mut stdin_lock); + } else { + file = File::open(path)?; + bufreader = io::BufReader::new(&mut file); + } let mut line = String::new(); loop { line.clear(); diff --git a/third-party/blake3/build.rs b/third-party/blake3/build.rs index ac1d6a64..a5dfd062 100644 --- a/third-party/blake3/build.rs +++ b/third-party/blake3/build.rs @@ -60,6 +60,20 @@ fn is_armv7() -> bool { target_components()[0] == "armv7" } +fn endianness() -> String { + let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); + assert!(endianness == "little" || endianness == "big"); + endianness +} + +fn is_little_endian() -> bool { + endianness() == "little" +} + +fn is_big_endian() -> bool { + endianness() == "big" +} + // Windows targets may be using the MSVC toolchain or the GNU toolchain. The // right compiler flags to use depend on the toolchain. 
(And we don't want to // use flag_if_supported, because we don't want features to be silently @@ -253,7 +267,13 @@ fn main() -> Result<(), Box> { } } - if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64()) { + if is_neon() && is_big_endian() { + panic!("The NEON implementation doesn't support big-endian ARM.") + } + + if (is_arm() && is_neon()) + || (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian()) + { println!("cargo:rustc-cfg=blake3_neon"); build_neon_c_intrinsics(); } diff --git a/third-party/blake3/c/CMakeLists.txt b/third-party/blake3/c/CMakeLists.txt index 3190effa..3a3b232d 100644 --- a/third-party/blake3/c/CMakeLists.txt +++ b/third-party/blake3/c/CMakeLists.txt @@ -1,15 +1,23 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.9 FATAL_ERROR) + +# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD +if (POLICY CMP0128) + cmake_policy(SET CMP0128 NEW) +endif() project(libblake3 - VERSION 1.4.1 + VERSION 1.5.1 DESCRIPTION "BLAKE3 C implementation" LANGUAGES C ASM ) -include(CheckCCompilerFlag) include(FeatureSummary) include(GNUInstallDirs) +# architecture lists for which to enable assembly / SIMD sources +set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) +set(BLAKE3_X86_NAMES i686 x86 X86) +set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) if(MSVC) set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") @@ -25,11 +33,13 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") + + if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + # 32-bit ARMv8 needs NEON to be enabled 
explicitly + set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON") + endif() endif() -# architecture lists for which to enable assembly / SIMD sources -set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) -set(BLAKE3_X86_NAMES i686 x86 X86) -set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # library target add_library(blake3 @@ -42,26 +52,40 @@ add_library(BLAKE3::blake3 ALIAS blake3) # library configuration set(BLAKE3_PKGCONFIG_CFLAGS) if (BUILD_SHARED_LIBS) - target_compile_definitions(blake3 + target_compile_definitions(blake3 PUBLIC BLAKE3_DLL PRIVATE BLAKE3_DLL_EXPORTS ) list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL) endif() -target_include_directories(blake3 PUBLIC $) +target_include_directories(blake3 PUBLIC + $ + $ +) set_target_properties(blake3 PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 0 C_VISIBILITY_PRESET hidden + C_EXTENSIONS OFF ) +target_compile_features(blake3 PUBLIC c_std_99) +# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD +# which may be set by the user or toolchain file +if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) + set_target_properties(blake3 PROPERTIES C_STANDARD 99) +endif() # optional SIMD sources macro(BLAKE3_DISABLE_SIMD) set(BLAKE3_SIMD_AMD64_ASM OFF) set(BLAKE3_SIMD_X86_INTRINSICS OFF) set(BLAKE3_SIMD_NEON_INTRINSICS OFF) - set_source_files_properties(blake3_dispatch.c PROPERTIES - COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512 + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=0 + BLAKE3_NO_SSE2 + BLAKE3_NO_SSE41 + BLAKE3_NO_AVX2 + BLAKE3_NO_AVX512 ) endmacro() @@ -100,7 +124,7 @@ if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) BLAKE3_DISABLE_SIMD() endif() - else() + else() BLAKE3_DISABLE_SIMD() endif() @@ -122,22 +146,19 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN set_source_files_properties(blake3_sse2.c PROPERTIES 
COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") -elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES - OR ((ANDROID_ABI STREQUAL "armeabi-v7a" - OR BLAKE3_USE_NEON_INTRINSICS) - AND (DEFINED BLAKE3_CFLAGS_NEON - OR CMAKE_SIZEOF_VOID_P EQUAL 8))) +elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + OR ANDROID_ABI STREQUAL "armeabi-v7a" + OR BLAKE3_USE_NEON_INTRINSICS) + AND (DEFINED BLAKE3_CFLAGS_NEON + OR CMAKE_SIZEOF_VOID_P EQUAL 8)) set(BLAKE3_SIMD_NEON_INTRINSICS ON) target_sources(blake3 PRIVATE blake3_neon.c ) - target_compile_options(blake3 PRIVATE -DBLAKE3_USE_NEON=1) - - check_c_compiler_flag(-mfpu=neon BLAKE3_MFPU_NEON_SUPPORTED) - if (BLAKE3_MFPU_NEON_SUPPORTED) - target_compile_options(blake3 PRIVATE -mfpu=neon) - endif() + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=1 + ) if (DEFINED BLAKE3_CFLAGS_NEON) set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") diff --git a/third-party/blake3/c/blake3.c b/third-party/blake3/c/blake3.c index 692f4b02..1b44c719 100644 --- a/third-party/blake3/c/blake3.c +++ b/third-party/blake3/c/blake3.c @@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node( size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). 
Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - // The second half of this loop condition is always true, and we just - // asserted it above. But GCC can't tell that it's always true, and if NDEBUG - // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious - // warnings here. GCC 8.5 is particularly sensitive, so if you're changing - // this code, test it against that version. - while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + while (num_cvs > 2) { num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); } +#endif memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } diff --git a/third-party/blake3/c/blake3.h b/third-party/blake3/c/blake3.h index 21e0d7b9..48284e50 100644 --- a/third-party/blake3/c/blake3.h +++ b/third-party/blake3/c/blake3.h @@ -30,7 +30,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.4.1" +#define BLAKE3_VERSION_STRING "1.5.1" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml index fff9f416..c1aee32e 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml +++ b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml @@ -20,9 +20,9 @@ neon = [] [dev-dependencies] arrayref = "0.3.5" arrayvec = { version = "0.7.0", default-features = false } -page_size = "0.4.1" -rand = "0.7.2" -rand_chacha = "0.2.1" +page_size = "0.6.0" +rand = "0.8.5" +rand_chacha = "0.3.1" reference_impl = { path = "../../reference_impl" } [build-dependencies] diff --git 
a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs index 1fc077c8..0730d930 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs +++ b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs @@ -485,7 +485,7 @@ fn test_fuzz_hasher() { let mut total_input = 0; // For each test, write 3 inputs of random length. for _ in 0..3 { - let input_len = rng.gen_range(0, INPUT_MAX + 1); + let input_len = rng.gen_range(0..INPUT_MAX + 1); dbg!(input_len); let input = &input_buf[total_input..][..input_len]; hasher.update(input); diff --git a/third-party/blake3/c/blake3_dispatch.c b/third-party/blake3/c/blake3_dispatch.c index 2ab0093e..af6c3dad 100644 --- a/third-party/blake3/c/blake3_dispatch.c +++ b/third-party/blake3/c/blake3_dispatch.c @@ -6,6 +6,7 @@ #if defined(IS_X86) #if defined(_MSC_VER) +#include #include #elif defined(__GNUC__) #include @@ -14,6 +15,32 @@ #endif #endif +#if !defined(BLAKE3_ATOMICS) +#if defined(__has_include) +#if __has_include() && !defined(_MSC_VER) +#define BLAKE3_ATOMICS 1 +#else +#define BLAKE3_ATOMICS 0 +#endif /* __has_include() && !defined(_MSC_VER) */ +#else +#define BLAKE3_ATOMICS 0 +#endif /* defined(__has_include) */ +#endif /* BLAKE3_ATOMICS */ + +#if BLAKE3_ATOMICS +#define ATOMIC_INT _Atomic int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#elif defined(_MSC_VER) +#define ATOMIC_INT LONG +#define ATOMIC_LOAD(x) InterlockedOr(&x, 0) +#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) +#else +#define ATOMIC_INT int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#endif + #define MAYBE_UNUSED(x) (void)((x)) #if defined(IS_X86) @@ -76,7 +103,7 @@ enum cpu_feature { #if !defined(BLAKE3_TESTING) static /* Allow the variable to be controlled manually for testing */ #endif - enum cpu_feature g_cpu_features = UNDEFINED; + ATOMIC_INT g_cpu_features = UNDEFINED; #if !defined(BLAKE3_TESTING) static @@ -84,14 +111,16 @@ static enum 
cpu_feature get_cpu_features(void) { - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; + /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ + enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); + if (features != UNDEFINED) { + return features; } else { #if defined(IS_X86) uint32_t regs[4] = {0}; uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; (void)edx; - enum cpu_feature features = 0; + features = 0; cpuid(regs, 0); const int max_id = *eax; cpuid(regs, 1); @@ -124,7 +153,7 @@ static } } } - g_cpu_features = features; + ATOMIC_STORE(g_cpu_features, features); return features; #else /* How to detect NEON? */ diff --git a/third-party/blake3/c/blake3_impl.h b/third-party/blake3/c/blake3_impl.h index 3ba9ceb0..beab5cf5 100644 --- a/third-party/blake3/c/blake3_impl.h +++ b/third-party/blake3/c/blake3_impl.h @@ -51,7 +51,11 @@ enum blake3_flags { #if !defined(BLAKE3_USE_NEON) // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness #if defined(IS_AARCH64) - #define BLAKE3_USE_NEON 1 + #if defined(__ARM_BIG_ENDIAN) + #define BLAKE3_USE_NEON 0 + #else + #define BLAKE3_USE_NEON 1 + #endif #else #define BLAKE3_USE_NEON 0 #endif diff --git a/third-party/blake3/c/blake3_neon.c b/third-party/blake3/c/blake3_neon.c index 8a818fc7..90bdd572 100644 --- a/third-party/blake3/c/blake3_neon.c +++ b/third-party/blake3/c/blake3_neon.c @@ -10,14 +10,12 @@ INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. - uint32x4_t x; - memcpy(&x, src, 16); - return x; + return vreinterpretq_u32_u8(vld1q_u8(src)); } INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { // vst1q_u32 has alignment requirements. Don't use it. 
- memcpy(dest, &src, 16); + vst1q_u8(dest, vreinterpretq_u8_u32(src)); } INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { diff --git a/third-party/blake3/rust/guts/Cargo.toml b/third-party/blake3/rust/guts/Cargo.toml new file mode 100644 index 00000000..ebcf77fd --- /dev/null +++ b/third-party/blake3/rust/guts/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "blake3_guts" +version = "0.0.0" +authors = ["Jack O'Connor ", "Samuel Neves"] +description = "low-level building blocks for the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +documentation = "https://docs.rs/blake3_guts" +readme = "readme.md" +edition = "2021" + +[dev-dependencies] +hex = "0.4.3" +reference_impl = { path = "../../reference_impl" } + +[features] +default = ["std"] +std = [] diff --git a/third-party/blake3/rust/guts/readme.md b/third-party/blake3/rust/guts/readme.md new file mode 100644 index 00000000..4957816d --- /dev/null +++ b/third-party/blake3/rust/guts/readme.md @@ -0,0 +1,80 @@ +# The BLAKE3 Guts API + +## Introduction + +This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains +low-level, high-performance, platform-specific implementations of the BLAKE3 +compression function. This API is complicated and unsafe, and this crate will +never have a stable release. Most callers should instead use the +[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend +on this one internally. + +The code you see here (as of January 2024) is an early stage of a large planned +refactor. The motivation for this refactor is a couple of missing features in +both the Rust and C implementations: + +- The output side + ([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html) + in Rust) doesn't take advantage of the most important SIMD optimizations that + compute multiple blocks in parallel. 
This blocks any project that wants to + use the BLAKE3 XOF as a stream cipher + ([[1]](https://github.com/oconnor663/bessie), + [[2]](https://github.com/oconnor663/blake3_aead)). +- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need + interior nodes of the tree also don't get those SIMD optimizations. They have + to use a slow, minimalistic, unstable, doc-hidden module [(also called + `guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs). + +The difficulty with adding those features is that they require changes to all +of our optimized assembly and C intrinsics code. That's a couple dozen +different files that are large, platform-specific, difficult to understand, and +full of duplicated code. The higher-level Rust and C implementations of BLAKE3 +both depend on these files and will need to coordinate changes. + +At the same time, it won't be long before we add support for more platforms: + +- RISCV vector extensions +- ARM SVE +- WebAssembly SIMD + +It's important to get this refactor done before new platforms make it even +harder to do. + +## The private guts API + +This is the API that each platform reimplements, so we want it to be as simple +as possible apart from the high-performance work it needs to do. It's +completely `unsafe`, and inputs and outputs are raw pointers that are allowed +to alias (this matters for `hash_parents`, see below). + +- `degree` +- `compress` + - The single compression function, for short inputs and odd-length tails. +- `hash_chunks` +- `hash_parents` +- `xof` +- `xof_xor` + - As `xof` but XOR'ing the result into the output buffer. +- `universal_hash` + - This is a new construction specifically to support + [BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some + implementations might just stub it out with portable code. + +## The public guts API + +This is the API that this crate exposes to callers, i.e. to the main `blake3` +crate. 
It's a thin, portable layer on top of the private API above. The Rust +version of this API is memory-safe. + +- `degree` +- `compress` +- `hash_chunks` +- `hash_parents` + - This handles most levels of the tree, where we keep hashing SIMD_DEGREE + parents at a time. +- `reduce_parents` + - This uses the same `hash_parents` private API, but it handles the top + levels of the tree where we reduce in-place to the root parent node. +- `xof` +- `xof_xor` +- `universal_hash` diff --git a/third-party/blake3/rust/guts/src/lib.rs b/third-party/blake3/rust/guts/src/lib.rs new file mode 100644 index 00000000..e9b4914b --- /dev/null +++ b/third-party/blake3/rust/guts/src/lib.rs @@ -0,0 +1,1000 @@ +//! # The BLAKE3 Guts API +//! +//! See `readme.md`. +//! +//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`] +//! that atomically initializes itself the first time you use it. +//! +//! # Example +//! +//! ```rust +//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT}; +//! +//! // Hash an input of exactly two chunks. +//! let input = [0u8; 2048]; +//! let mut outputs = TransposedVectors::new(); +//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs); +//! DETECTED_IMPL.hash_chunks( +//! &input, +//! &IV_BYTES, +//! 0, // counter +//! 0, // flags +//! left_outputs, +//! ); +//! let root_node = outputs.extract_parent_node(0); +//! let hash = DETECTED_IMPL.compress( +//! &root_node, +//! 64, // block_len +//! &IV_BYTES, +//! 0, // counter +//! PARENT | ROOT, +//! ); +//! +//! // Compute the same hash using the reference implementation. +//! let mut reference_hasher = reference_impl::Hasher::new(); +//! reference_hasher.update(&input); +//! let mut expected_hash = [0u8; 32]; +//! reference_hasher.finalize(&mut expected_hash); +//! +//! assert_eq!(hash, expected_hash); +//! ``` + +// Tests always require libstd. 
+#![cfg_attr(all(not(feature = "std"), not(test)), no_std)] + +use core::cmp; +use core::marker::PhantomData; +use core::mem; +use core::ptr; +use core::sync::atomic::{AtomicPtr, Ordering::Relaxed}; + +pub mod portable; + +#[cfg(test)] +mod test; + +pub const OUT_LEN: usize = 32; +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; +pub const WORD_LEN: usize = 4; +pub const UNIVERSAL_HASH_LEN: usize = 16; + +pub const CHUNK_START: u32 = 1 << 0; +pub const CHUNK_END: u32 = 1 << 1; +pub const PARENT: u32 = 1 << 2; +pub const ROOT: u32 = 1 << 3; +pub const KEYED_HASH: u32 = 1 << 4; +pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +pub const IV: CVWords = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; +pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV); + +pub const MSG_SCHEDULE: [[usize; 16]; 7] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], +]; + +// never less than 2 +pub const MAX_SIMD_DEGREE: usize = 2; + +pub type CVBytes = [u8; 32]; +pub type CVWords = [u32; 8]; +pub type BlockBytes = [u8; 64]; +pub type BlockWords = [u32; 16]; + +pub static DETECTED_IMPL: Implementation = Implementation::new( + degree_init, + compress_init, + hash_chunks_init, + hash_parents_init, + xof_init, + xof_xor_init, + universal_hash_init, +); + +fn detect() -> Implementation { + portable::implementation() +} + +fn init_detected_impl() { + let detected = detect(); + + DETECTED_IMPL + .degree_ptr + .store(detected.degree_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .compress_ptr + 
.store(detected.compress_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_chunks_ptr + .store(detected.hash_chunks_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_parents_ptr + .store(detected.hash_parents_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_ptr + .store(detected.xof_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_xor_ptr + .store(detected.xof_xor_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .universal_hash_ptr + .store(detected.universal_hash_ptr.load(Relaxed), Relaxed); +} + +pub struct Implementation { + degree_ptr: AtomicPtr<()>, + compress_ptr: AtomicPtr<()>, + hash_chunks_ptr: AtomicPtr<()>, + hash_parents_ptr: AtomicPtr<()>, + xof_ptr: AtomicPtr<()>, + xof_xor_ptr: AtomicPtr<()>, + universal_hash_ptr: AtomicPtr<()>, +} + +impl Implementation { + const fn new( + degree_fn: DegreeFn, + compress_fn: CompressFn, + hash_chunks_fn: HashChunksFn, + hash_parents_fn: HashParentsFn, + xof_fn: XofFn, + xof_xor_fn: XofFn, + universal_hash_fn: UniversalHashFn, + ) -> Self { + Self { + degree_ptr: AtomicPtr::new(degree_fn as *mut ()), + compress_ptr: AtomicPtr::new(compress_fn as *mut ()), + hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()), + hash_parents_ptr: AtomicPtr::new(hash_parents_fn as *mut ()), + xof_ptr: AtomicPtr::new(xof_fn as *mut ()), + xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()), + universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()), + } + } + + #[inline] + fn degree_fn(&self) -> DegreeFn { + unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) } + } + + #[inline] + pub fn degree(&self) -> usize { + let degree = unsafe { self.degree_fn()() }; + debug_assert!(degree >= 2); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(1, degree.count_ones(), "power of 2"); + degree + } + + #[inline] + pub fn split_transposed_vectors<'v>( + &self, + vectors: &'v mut TransposedVectors, + ) -> (TransposedSplit<'v>, TransposedSplit<'v>) { + unsafe { vectors.split(self.degree()) } + } + + #[inline] + 
fn compress_fn(&self) -> CompressFn { + unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) } + } + + #[inline] + pub fn compress( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + counter: u64, + flags: u32, + ) -> CVBytes { + let mut out = [0u8; 32]; + unsafe { + self.compress_fn()(block, block_len, cv, counter, flags, &mut out); + } + out + } + + // The contract for HashChunksFn doesn't require the implementation to support single-chunk + // inputs. Instead we handle that case here by calling compress in a loop. + #[inline] + fn hash_one_chunk( + &self, + mut input: &[u8], + key: &CVBytes, + counter: u64, + mut flags: u32, + output: TransposedSplit, + ) { + debug_assert!(input.len() <= CHUNK_LEN); + let mut cv = *key; + flags |= CHUNK_START; + while input.len() > BLOCK_LEN { + cv = self.compress( + input[..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u32, + &cv, + counter, + flags, + ); + input = &input[BLOCK_LEN..]; + flags &= !CHUNK_START; + } + let mut final_block = [0u8; BLOCK_LEN]; + final_block[..input.len()].copy_from_slice(input); + cv = self.compress( + &final_block, + input.len() as u32, + &cv, + counter, + flags | CHUNK_END, + ); + unsafe { + write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr); + } + } + + #[inline] + fn hash_chunks_fn(&self) -> HashChunksFn { + unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_chunks( + &self, + input: &[u8], + key: &CVBytes, + counter: u64, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(input.len() <= self.degree() * CHUNK_LEN); + if input.len() <= CHUNK_LEN { + // The underlying hash_chunks_fn isn't required to support this case. Instead we handle + // it by calling compress_fn in a loop. But note that we still don't support root + // finalization or the empty input here. 
+ self.hash_one_chunk(input, key, counter, flags, transposed_output); + return 1; + } + // SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently + // ignore the remainder. This makes it impossible to write out of bounds in a properly + // constructed TransposedSplit. + let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN); + unsafe { + self.hash_chunks_fn()( + input.as_ptr(), + len, + key, + counter, + flags, + transposed_output.ptr, + ); + } + if input.len() % CHUNK_LEN == 0 { + input.len() / CHUNK_LEN + } else { + (input.len() / CHUNK_LEN) + 1 + } + } + + #[inline] + fn hash_parents_fn(&self) -> HashParentsFn { + unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_parents( + &self, + transposed_input: &TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. + num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()( + transposed_input.as_ptr(), + num_parents, + key, + flags | PARENT, + transposed_output.ptr, + ); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + pub fn reduce_parents( + &self, + transposed_in_out: &mut TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. 
+ num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let in_out_ptr = transposed_in_out.as_mut_ptr(); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + fn xof_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn xof_xor_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof_xor( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_xor_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn universal_hash_fn(&self) -> UniversalHashFn { + unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) } + } + + #[inline] + pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] { + let degree = self.degree(); + let simd_len = degree * 
BLOCK_LEN; + let mut ret = [0u8; 16]; + while !input.is_empty() { + let take = cmp::min(simd_len, input.len()); + let mut output = [0u8; 16]; + unsafe { + self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output); + } + input = &input[take..]; + counter += degree as u64; + for byte_index in 0..16 { + ret[byte_index] ^= output[byte_index]; + } + } + ret + } +} + +impl Clone for Implementation { + fn clone(&self) -> Self { + Self { + degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)), + compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)), + hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)), + hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)), + xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)), + xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)), + universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)), + } + } +} + +// never less than 2 +type DegreeFn = unsafe extern "C" fn() -> usize; + +unsafe extern "C" fn degree_init() -> usize { + init_detected_impl(); + DETECTED_IMPL.degree_fn()() +} + +type CompressFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, // may overlap the input +); + +unsafe extern "C" fn compress_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + init_detected_impl(); + DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out); +} + +type CompressXofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, // may overlap the input +); + +type HashChunksFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +); + +unsafe 
extern "C" fn hash_chunks_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output); +} + +type HashParentsFn = unsafe extern "C" fn( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +); + +unsafe extern "C" fn hash_parents_init( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output); +} + +// This signature covers both xof() and xof_xor(). +type XofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +); + +unsafe extern "C" fn xof_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +unsafe extern "C" fn xof_xor_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +type UniversalHashFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +); + +unsafe extern "C" fn universal_hash_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + init_detected_impl(); + DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out); +} + +// The 
implicit degree of this implementation is MAX_SIMD_DEGREE. +#[inline(always)] +unsafe fn hash_chunks_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + flags: u32, + mut transposed_output: *mut u32, +) { + debug_assert!(input_len > 0); + debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN); + input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN); + while input_len > 0 { + let mut chunk_len = cmp::min(input_len, CHUNK_LEN); + input_len -= chunk_len; + // We only use 8 words of the CV, but compress returns 16. + let mut cv = *key; + let cv_ptr: *mut CVBytes = &mut cv; + let mut chunk_flags = flags | CHUNK_START; + while chunk_len > BLOCK_LEN { + compress( + input as *const BlockBytes, + BLOCK_LEN as u32, + cv_ptr, + counter, + chunk_flags, + cv_ptr, + ); + input = input.add(BLOCK_LEN); + chunk_len -= BLOCK_LEN; + chunk_flags &= !CHUNK_START; + } + let mut last_block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len); + input = input.add(chunk_len); + compress( + &last_block, + chunk_len as u32, + cv_ptr, + counter, + chunk_flags | CHUNK_END, + cv_ptr, + ); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_output = transposed_output.add(1); + counter += 1; + } +} + +// The implicit degree of this implementation is MAX_SIMD_DEGREE. 
+#[inline(always)] +unsafe fn hash_parents_using_compress( + compress: CompressFn, + mut transposed_input: *const u32, + mut num_parents: usize, + key: *const CVBytes, + flags: u32, + mut transposed_output: *mut u32, // may overlap the input +) { + debug_assert!(num_parents > 0); + debug_assert!(num_parents <= MAX_SIMD_DEGREE); + while num_parents > 0 { + let mut block_bytes = [0u8; 64]; + for word_index in 0..8 { + let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read(); + block_bytes[WORD_LEN * word_index..][..WORD_LEN] + .copy_from_slice(&left_child_word.to_le_bytes()); + let right_child_word = transposed_input + .add(word_index * TRANSPOSED_STRIDE + 1) + .read(); + block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN] + .copy_from_slice(&right_child_word.to_le_bytes()); + } + let mut cv = [0u8; 32]; + compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_input = transposed_input.add(2); + transposed_output = transposed_output.add(1); + num_parents -= 1; + } +} + +#[inline(always)] +unsafe fn xof_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: *mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + ptr::copy_nonoverlapping(block_output.as_ptr(), out, take); + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn xof_xor_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: 
*mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + for i in 0..take { + *out.add(i) ^= block_output[i]; + } + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn universal_hash_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + out: *mut [u8; 16], +) { + let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT; + let mut result = [0u8; 16]; + while input_len > 0 { + let block_len = cmp::min(input_len, BLOCK_LEN); + let mut block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len); + let mut block_output = [0u8; 32]; + compress( + &block, + block_len as u32, + key, + counter, + flags, + &mut block_output, + ); + for i in 0..16 { + result[i] ^= block_output[i]; + } + input = input.add(block_len); + input_len -= block_len; + counter += 1; + } + *out = result; +} + +// this is in units of *words*, for pointer operations on *const/*mut u32 +const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE; + +#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]); + +impl TransposedVectors { + pub fn new() -> Self { + Self([[0; 2 * MAX_SIMD_DEGREE]; 8]) + } + + pub fn extract_cv(&self, cv_index: usize) -> CVBytes { + let mut words = [0u32; 8]; + for word_index in 0..8 { + words[word_index] = self.0[word_index][cv_index]; + } + le_bytes_from_words_32(&words) + } + + pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes { + let mut bytes = [0u8; 64]; + bytes[..32].copy_from_slice(&self.extract_cv(parent_index / 2)); + bytes[32..].copy_from_slice(&self.extract_cv(parent_index 
/ 2 + 1)); + bytes + } + + fn as_ptr(&self) -> *const u32 { + self.0[0].as_ptr() + } + + fn as_mut_ptr(&mut self) -> *mut u32 { + self.0[0].as_mut_ptr() + } + + // SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not + // necessarily correct) to write up to `degree` words to either side of the split, possibly + // from different threads. + unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) { + debug_assert!(degree > 0); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(degree.count_ones(), 1, "power of 2"); + let ptr = self.as_mut_ptr(); + let left = TransposedSplit { + ptr, + phantom_data: PhantomData, + }; + let right = TransposedSplit { + ptr: ptr.wrapping_add(degree), + phantom_data: PhantomData, + }; + (left, right) + } +} + +pub struct TransposedSplit<'vectors> { + ptr: *mut u32, + phantom_data: PhantomData<&'vectors mut u32>, +} + +unsafe impl<'vectors> Send for TransposedSplit<'vectors> {} +unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {} + +unsafe fn read_transposed_cv(src: *const u32) -> CVWords { + let mut cv = [0u32; 8]; + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + cv[word_index] = src.add(offset_words).read(); + } + cv +} + +unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) { + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + dest.add(offset_words).write(cv[word_index]); + } +} + +#[inline(always)] +pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes { + let mut bytes = [0u8; 32]; + // This loop is super verbose because currently that's what it takes to be const. 
+ let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes { + let mut bytes = [0u8; 64]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords { + let mut words = [0u32; 8]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[inline(always)] +pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords { + let mut words = [0u32; 16]; + // This loop is super verbose because currently that's what it takes to be const. 
+ let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[test] +fn test_byte_word_round_trips() { + let cv = *b"This is 32 LE bytes/eight words."; + assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv))); + let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words."; + assert_eq!( + block, + le_bytes_from_words_64(&words_from_le_bytes_64(&block)), + ); +} + +// The largest power of two less than or equal to `n`, used for left_len() +// immediately below, and also directly in Hasher::update(). +pub fn largest_power_of_two_leq(n: usize) -> usize { + ((n / 2) + 1).next_power_of_two() +} + +#[test] +fn test_largest_power_of_two_leq() { + let input_output = &[ + // The zero case is nonsensical, but it does work. + (0, 1), + (1, 1), + (2, 2), + (3, 2), + (4, 4), + (5, 4), + (6, 4), + (7, 4), + (8, 8), + // the largest possible usize + (usize::MAX, (usize::MAX >> 1) + 1), + ]; + for &(input, output) in input_output { + assert_eq!( + output, + crate::largest_power_of_two_leq(input), + "wrong output for n={}", + input + ); + } +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +pub fn left_len(content_len: usize) -> usize { + debug_assert!(content_len > CHUNK_LEN); + // Subtract 1 to reserve at least one byte for the right side. 
+ let full_chunks = (content_len - 1) / CHUNK_LEN; + largest_power_of_two_leq(full_chunks) * CHUNK_LEN +} + +#[test] +fn test_left_len() { + let input_output = &[ + (CHUNK_LEN + 1, CHUNK_LEN), + (2 * CHUNK_LEN - 1, CHUNK_LEN), + (2 * CHUNK_LEN, CHUNK_LEN), + (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN, 2 * CHUNK_LEN), + (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), + ]; + for &(input, output) in input_output { + assert_eq!(left_len(input), output); + } +} diff --git a/third-party/blake3/rust/guts/src/portable.rs b/third-party/blake3/rust/guts/src/portable.rs new file mode 100644 index 00000000..d5976440 --- /dev/null +++ b/third-party/blake3/rust/guts/src/portable.rs @@ -0,0 +1,262 @@ +use crate::{ + le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64, + BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE, +}; + +const DEGREE: usize = MAX_SIMD_DEGREE; + +unsafe extern "C" fn degree() -> usize { + DEGREE +} + +#[inline(always)] +fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +#[inline(always)] +fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) { + // Select the message schedule based on the round. + let schedule = MSG_SCHEDULE[round]; + + // Mix the columns. 
+ g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +#[inline(always)] +fn compress_inner( + block_words: &BlockWords, + block_len: u32, + cv_words: &CVWords, + counter: u64, + flags: u32, +) -> [u32; 16] { + let mut state = [ + cv_words[0], + cv_words[1], + cv_words[2], + cv_words[3], + cv_words[4], + cv_words[5], + cv_words[6], + cv_words[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter as u32, + (counter >> 32) as u32, + block_len as u32, + flags as u32, + ]; + for round_number in 0..7 { + round(&mut state, &block_words, round_number); + } + state +} + +pub(crate) unsafe extern "C" fn compress( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + } + *out = le_bytes_from_words_32(state[..8].try_into().unwrap()); +} + +pub(crate) unsafe extern "C" fn compress_xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + state[word_index + 8] ^= 
cv_words[word_index]; + } + *out = le_bytes_from_words_64(&state); +} + +pub(crate) unsafe extern "C" fn hash_chunks( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + crate::hash_chunks_using_compress( + compress, + input, + input_len, + key, + counter, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn hash_parents( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +) { + crate::hash_parents_using_compress( + compress, + transposed_input, + num_parents, + key, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn xof_xor( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_xor_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn universal_hash( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + crate::universal_hash_using_compress(compress, input, input_len, key, counter, out) +} + +pub fn implementation() -> Implementation { + Implementation::new( + degree, + compress, + hash_chunks, + hash_parents, + xof, + xof_xor, + universal_hash, + ) +} + +#[cfg(test)] +mod test { + use super::*; + + // This is circular but do it anyway. 
+ #[test] + fn test_compress_vs_portable() { + crate::test::test_compress_vs_portable(&implementation()); + } + + #[test] + fn test_compress_vs_reference() { + crate::test::test_compress_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_chunks_vs_portable() { + crate::test::test_hash_chunks_vs_portable(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_parents_vs_portable() { + crate::test::test_hash_parents_vs_portable(&implementation()); + } + + #[test] + fn test_chunks_and_parents_vs_reference() { + crate::test::test_chunks_and_parents_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_xof_vs_portable() { + crate::test::test_xof_vs_portable(&implementation()); + } + + #[test] + fn test_xof_vs_reference() { + crate::test::test_xof_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_universal_hash_vs_portable() { + crate::test::test_universal_hash_vs_portable(&implementation()); + } + + #[test] + fn test_universal_hash_vs_reference() { + crate::test::test_universal_hash_vs_reference(&implementation()); + } +} diff --git a/third-party/blake3/rust/guts/src/test.rs b/third-party/blake3/rust/guts/src/test.rs new file mode 100644 index 00000000..83bd790c --- /dev/null +++ b/third-party/blake3/rust/guts/src/test.rs @@ -0,0 +1,523 @@ +use crate::*; + +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; + +// Test a few different initial counter values. +// - 0: The base case. +// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when +// you're supposed to ANDNOT. +// - u32::MAX: The low word of the counter overflows for all inputs except the first. +// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word. 
+const INITIAL_COUNTERS: [u64; 4] = [ + 0, + i32::MAX as u64, + u32::MAX as u64, + (42u64 << 32) + u32::MAX as u64, +]; + +const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64]; + +pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +pub fn test_compress_vs_portable(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_cv = portable::implementation().compress( + &block, + block_len as u32, + &TEST_KEY, + counter, + KEYED_HASH, + ); + + let test_cv = + test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH); + + assert_eq!(portable_cv, test_cv); + } + } +} + +pub fn test_compress_vs_reference(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + + let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + ref_hasher.update(&block[..block_len]); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_cv = test_impl.compress( + &block, + block_len as u32, + &TEST_KEY, + 0, + CHUNK_START | CHUNK_END | ROOT | KEYED_HASH, + ); + + assert_eq!(ref_hash, test_cv); + } +} + +fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) { + if output_a == output_b { + return; + } + for cv_index in 0..2 * MAX_SIMD_DEGREE { + let cv_a = output_a.extract_cv(cv_index); + let cv_b = output_b.extract_cv(cv_index); + if cv_a == [0; 32] && cv_b == [0; 32] { + println!("CV {cv_index:2} empty"); + } else if cv_a == cv_b { + println!("CV {cv_index:2} matches"); + } else { + println!("CV {cv_index:2} mismatch:"); + println!(" {}", hex::encode(cv_a)); + println!(" {}", hex::encode(cv_b)); + } + } + panic!("transposed outputs are not equal"); +} + +pub fn 
test_hash_chunks_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + dbg!(test_impl.degree() * CHUNK_LEN); + // Allocate 4 extra bytes of padding so we can make aligned slices. + let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4]; + let mut input_slice = &mut input_buf[..]; + // Make sure the start of the input is word-aligned. + while input_slice.as_ptr() as usize % 4 != 0 { + input_slice = &mut input_slice[1..]; + } + let (aligned_input, mut unaligned_input) = + input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN); + unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN]; + assert_eq!(aligned_input.as_ptr() as usize % 4, 0); + assert_eq!(unaligned_input.as_ptr() as usize % 4, 1); + paint_test_input(aligned_input); + paint_test_input(unaligned_input); + // Try just below, equal to, and just above every whole number of chunks. + let mut input_2_lengths = Vec::new(); + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + input_2_lengths.push(next_len - 95); + input_2_lengths.push(next_len); + if next_len == test_impl.degree() * CHUNK_LEN { + break; + } + input_2_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for input_2_len in input_2_lengths { + dbg!(input_2_len); + let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN]; + let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN]; + let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + for initial_counter in INITIAL_COUNTERS { + dbg!(initial_counter); + // Make two calls, to test the output_column parameter. 
+ let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut portable_output); + portable::implementation().hash_chunks( + aligned_input1, + &IV_BYTES, + initial_counter, + 0, + portable_left, + ); + portable::implementation().hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left); + test_impl.hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + test_right, + ); + check_transposed_eq(&portable_output, &test_output); + + // Do the same thing with unaligned input. + let mut unaligned_test_output = TransposedVectors::new(); + let (unaligned_left, unaligned_right) = + test_impl.split_transposed_vectors(&mut unaligned_test_output); + test_impl.hash_chunks( + unaligned_input1, + &IV_BYTES, + initial_counter, + 0, + unaligned_left, + ); + test_impl.hash_chunks( + unaligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + unaligned_right, + ); + check_transposed_eq(&portable_output, &unaligned_test_output); + } + } +} + +fn painted_transposed_input() -> TransposedVectors { + let mut vectors = TransposedVectors::new(); + let mut val = 0; + for col in 0..2 * MAX_SIMD_DEGREE { + for row in 0..8 { + vectors.0[row][col] = val; + val += 1; + } + } + vectors +} + +pub fn test_hash_parents_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + let input = painted_transposed_input(); + for num_parents in 2..=(test_impl.degree() / 2) { + dbg!(num_parents); + let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut 
portable_output); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + portable_left, + ); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + test_left, + ); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + test_right, + ); + + check_transposed_eq(&portable_output, &test_output); + } +} + +fn hash_with_chunks_and_parents_recurse( + test_impl: &Implementation, + input: &[u8], + counter: u64, + output: TransposedSplit, +) -> usize { + assert!(input.len() > 0); + if input.len() <= test_impl.degree() * CHUNK_LEN { + return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output); + } + let (left_input, right_input) = input.split_at(left_len(input.len())); + let mut child_output = TransposedVectors::new(); + let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output); + let mut children = + hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output); + assert_eq!(children, test_impl.degree()); + children += hash_with_chunks_and_parents_recurse( + test_impl, + right_input, + counter + (left_input.len() / CHUNK_LEN) as u64, + right_output, + ); + test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output) +} + +// Note: This test implementation doesn't support the 1-chunk-or-less case. +fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes { + // TODO: handle the 1-chunk case? + assert!(input.len() > CHUNK_LEN); + let mut cvs = TransposedVectors::new(); + // The right half of these vectors are never used. 
+ let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs); + let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left); + while num_cvs > 2 { + num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0); + } + test_impl.compress( + &cvs.extract_parent_node(0), + BLOCK_LEN as u32, + &IV_BYTES, + 0, + PARENT | ROOT, + ) +} + +pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) { + assert_eq!(test_impl.degree().count_ones(), 1, "power of 2"); + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN; + let mut input_buf = [0u8; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try just below, equal to, and just above every whole number of chunks, except that + // root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case. + let mut test_lengths = vec![CHUNK_LEN + 1]; + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + test_lengths.push(next_len - 95); + test_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + test_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for test_len in test_lengths { + dbg!(test_len); + let input = &input_buf[..test_len]; + + let mut ref_hasher = reference_impl::Hasher::new(); + ref_hasher.update(&input); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_hash = root_hash_with_chunks_and_parents(test_impl, input); + + assert_eq!(ref_hash, test_hash); + } +} + +pub fn test_xof_vs_portable(test_impl: &Implementation) { + let flags = CHUNK_START | CHUNK_END | KEYED_HASH; + for counter in INITIAL_COUNTERS { + dbg!(counter); + for input_len in [0, 1, BLOCK_LEN] { + dbg!(input_len); + let mut input_block = [0u8; BLOCK_LEN]; + for byte_index in 0..input_len { + input_block[byte_index] = byte_index as u8 + 42; + } + // Try equal to and partway through every whole number of output blocks. 
+ const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut output_lengths = Vec::new(); + let mut next_len = 0; + loop { + output_lengths.push(next_len); + if next_len == MAX_OUTPUT_LEN { + break; + } + output_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for output_len in output_lengths { + dbg!(output_len); + let mut portable_output = [0xff; MAX_OUTPUT_LEN]; + portable::implementation().xof( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut portable_output[..output_len], + ); + let mut test_output = [0xff; MAX_OUTPUT_LEN]; + test_impl.xof( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert_eq!(portable_output, test_output); + + // Double check that the implementation didn't overwrite. + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + + // The first XOR cancels out the output. + test_impl.xof_xor( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert!(test_output[..output_len].iter().all(|&b| b == 0)); + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + + // The second XOR restores out the output. + test_impl.xof_xor( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert_eq!(portable_output, test_output); + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + } + } + } +} + +pub fn test_xof_vs_reference(test_impl: &Implementation) { + let input = b"hello world"; + let mut input_block = [0; BLOCK_LEN]; + input_block[..input.len()].copy_from_slice(input); + + const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut ref_output = [0; MAX_OUTPUT_LEN]; + let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + ref_hasher.update(input); + ref_hasher.finalize(&mut ref_output); + + // Try equal to and partway through every whole number of output blocks. 
+ let mut output_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + output_lengths.push(next_len); + if next_len == MAX_OUTPUT_LEN { + break; + } + output_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + + for output_len in output_lengths { + dbg!(output_len); + let mut test_output = [0; MAX_OUTPUT_LEN]; + test_impl.xof( + &input_block, + input.len() as u32, + &TEST_KEY, + 0, + KEYED_HASH | CHUNK_START | CHUNK_END, + &mut test_output[..output_len], + ); + assert_eq!(ref_output[..output_len], test_output[..output_len]); + + // Double check that the implementation didn't overwrite. + assert!(test_output[output_len..].iter().all(|&b| b == 0)); + + // Do it again starting from block 1. + if output_len >= BLOCK_LEN { + test_impl.xof( + &input_block, + input.len() as u32, + &TEST_KEY, + 1, + KEYED_HASH | CHUNK_START | CHUNK_END, + &mut test_output[..output_len - BLOCK_LEN], + ); + assert_eq!( + ref_output[BLOCK_LEN..output_len], + test_output[..output_len - BLOCK_LEN], + ); + } + } +} + +pub fn test_universal_hash_vs_portable(test_impl: &Implementation) { + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut input_buf = [0; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try equal to and partway through every whole number of input blocks. 
+ let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_output = portable::implementation().universal_hash( + &input_buf[..input_len], + &TEST_KEY, + counter, + ); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter); + assert_eq!(portable_output, test_output); + } + } +} + +fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] { + // The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended + // output to seek to a block. + const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE; + assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS); + let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS]; + let mut result = [0u8; UNIVERSAL_HASH_LEN]; + let mut block_start = 0; + while block_start < input.len() { + let block_len = cmp::min(input.len() - block_start, BLOCK_LEN); + let mut ref_hasher = reference_impl::Hasher::new_keyed(key); + ref_hasher.update(&input[block_start..block_start + block_len]); + ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]); + for byte_index in 0..UNIVERSAL_HASH_LEN { + result[byte_index] ^= output_buffer[block_start + byte_index]; + } + block_start += BLOCK_LEN; + } + result +} + +pub fn test_universal_hash_vs_reference(test_impl: &Implementation) { + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut input_buf = [0; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try equal to and partway through every whole number of input blocks. 
+ let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0); + assert_eq!(ref_output, test_output); + } +} diff --git a/third-party/blake3/src/io.rs b/third-party/blake3/src/io.rs new file mode 100644 index 00000000..1c19881e --- /dev/null +++ b/third-party/blake3/src/io.rs @@ -0,0 +1,79 @@ +//! Helper functions for efficient IO. + +#[cfg(feature = "std")] +pub(crate) fn copy_wide( + mut reader: impl std::io::Read, + hasher: &mut crate::Hasher, +) -> std::io::Result { + let mut buffer = [0; 65536]; + let mut total = 0; + loop { + match reader.read(&mut buffer) { + Ok(0) => return Ok(total), + Ok(n) => { + hasher.update(&buffer[..n]); + total += n as u64; + } + // see test_update_reader_interrupted + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + } + } +} + +// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or +// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it +// fails, return the error. +// +// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like +// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get +// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because +// this function is crate-private, we can guarantee that all can ever happen in the event of a race +// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of +// which should risk memory corruption in a safe caller. 
+// +// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no +// platform in the "real world" is ever going to do anything other than compute the "wrong answer" +// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this? +// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a +// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have +// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do +// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to +// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert +// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give +// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's +// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to +// construct a safe &i32 to the register if you're going to leak that reference to unknown callers. +// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally +// different here. Feedback needed. +#[cfg(feature = "mmap")] +pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result> { + let metadata = file.metadata()?; + let file_size = metadata.len(); + #[allow(clippy::if_same_then_else)] + if !metadata.is_file() { + // Not a real file. + Ok(None) + } else if file_size > isize::max_value() as u64 { + // Too long to safely map. + // https://github.com/danburkert/memmap-rs/issues/69 + Ok(None) + } else if file_size == 0 { + // Mapping an empty file currently fails. + // https://github.com/danburkert/memmap-rs/issues/72 + // See test_mmap_virtual_file. + Ok(None) + } else if file_size < 16 * 1024 { + // Mapping small files is not worth it. 
+ Ok(None) + } else { + // Explicitly set the length of the memory map, so that filesystem + // changes can't race to violate the invariants we just checked. + let map = unsafe { + memmap2::MmapOptions::new() + .len(file_size as usize) + .map(file)? + }; + Ok(Some(map)) + } +} diff --git a/third-party/blake3/src/lib.rs b/third-party/blake3/src/lib.rs index ac61fb27..d661cb2d 100644 --- a/third-party/blake3/src/lib.rs +++ b/third-party/blake3/src/lib.rs @@ -33,15 +33,33 @@ //! # Cargo Features //! //! The `std` feature (the only feature enabled by default) is required for -//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime -//! CPU feature detection on x86. If this feature is disabled, the only way to -//! use the x86 SIMD implementations is to enable the corresponding instruction -//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting -//! binary will not be portable to other machines. +//! implementations of the [`Write`] and [`Seek`] traits, the +//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU +//! feature detection on x86. If this feature is disabled, the only way to use +//! the x86 SIMD implementations is to enable the corresponding instruction sets +//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary +//! will not be portable to other machines. //! //! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds -//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However, -//! even if this feature is enabled, all other APIs remain single-threaded. +//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap` +//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for +//! multithreaded hashing. However, even if this feature is enabled, all other +//! APIs remain single-threaded. +//! +//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the +//! 
[`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above) +//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for +//! memory-mapped IO. +//! +//! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) +//! implements +//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for +//! this crate's types. +//! +//! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements +//! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and +//! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html) +//! for [`Hash`](struct@Hash). //! //! The NEON implementation is enabled by default for AArch64 but requires the //! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and @@ -49,12 +67,12 @@ //! without NEON support. //! //! The `traits-preview` feature enables implementations of traits from the -//! RustCrypto [`digest`] crate, and re-exports that crate as -//! `traits::digest`. However, the traits aren't stable, and they're expected to -//! change in incompatible ways before that crate reaches 1.0. For that reason, -//! this crate makes no SemVer guarantees for this feature, and callers who use -//! it should expect breaking changes between patch versions. (The "-preview" -//! feature name follows the conventions of the RustCrypto [`signature`] crate.) +//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`. +//! However, the traits aren't stable, and they're expected to change in +//! incompatible ways before that crate reaches 1.0. For that reason, this crate +//! makes no SemVer guarantees for this feature, and callers who use it should +//! expect breaking changes between patch versions. (The "-preview" feature name +//! follows the conventions of the RustCrypto [`signature`] crate.) //! //! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon //! 
[BLAKE3]: https://blake3.io @@ -112,6 +130,7 @@ mod sse41; #[cfg(feature = "traits-preview")] pub mod traits; +mod io; mod join; use arrayref::{array_mut_ref, array_ref}; @@ -197,6 +216,8 @@ fn counter_high(counter: u64) -> u32 { /// [`from_hex`]: #method.from_hex /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash)] pub struct Hash([u8; OUT_LEN]); @@ -284,10 +305,28 @@ impl core::str::FromStr for Hash { } } +// A proper implementation of constant time equality is tricky, and we get it from the +// constant_time_eq crate instead of rolling our own. However, that crate isn't compatible with +// Miri, so we roll our own just for that. +#[cfg(miri)] +fn constant_time_eq_miri(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + let mut x = 0; + for i in 0..a.len() { + x |= a[i] ^ b[i]; + } + x == 0 +} + /// This implementation is constant-time. 
impl PartialEq for Hash { #[inline] fn eq(&self, other: &Hash) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, &other.0); + #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, &other.0) } } @@ -296,6 +335,9 @@ impl PartialEq for Hash { impl PartialEq<[u8; OUT_LEN]> for Hash { #[inline] fn eq(&self, other: &[u8; OUT_LEN]) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, other); + #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, other) } } @@ -304,6 +346,9 @@ impl PartialEq<[u8; OUT_LEN]> for Hash { impl PartialEq<[u8]> for Hash { #[inline] fn eq(&self, other: &[u8]) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, other); + #[cfg(not(miri))] constant_time_eq::constant_time_eq(&self.0, other) } } @@ -371,6 +416,7 @@ impl std::error::Error for HexError {} // Each chunk or parent node can produce either a 32-byte chaining value or, by // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] struct Output { input_chaining_value: CVWords, @@ -378,6 +424,7 @@ struct Output { block_len: u8, counter: u64, flags: u8, + #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -414,6 +461,7 @@ impl Output { } #[derive(Clone)] +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] struct ChunkState { cv: CVWords, chunk_counter: u64, @@ -421,6 +469,7 @@ struct ChunkState { buf_len: u8, blocks_compressed: u8, flags: u8, + #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -903,6 +952,9 @@ fn parent_node_output( /// An incremental hash state that can accept any number of writes. /// +/// The `rayon` and `mmap` Cargo features enable additional methods on this +/// type related to multithreading and memory-mapped IO. 
+/// /// When the `traits-preview` Cargo feature is enabled, this type implements /// several commonly used traits from the /// [`digest`](https://crates.io/crates/digest) crate. However, those @@ -911,15 +963,6 @@ fn parent_node_output( /// guarantees for this feature, and callers who use it should expect breaking /// changes between patch versions. /// -/// When the `rayon` Cargo feature is enabled, the -/// [`update_rayon`](#method.update_rayon) method is available for multithreaded -/// hashing. -/// -/// **Performance note:** The [`update`](#method.update) method can't take full -/// advantage of SIMD optimizations if its input buffer is too small or oddly -/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently -/// supported SIMD instruction sets. -/// /// # Examples /// /// ``` @@ -942,6 +985,7 @@ fn parent_node_output( /// # } /// ``` #[derive(Clone)] +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] pub struct Hasher { key: CVWords, chunk_state: ChunkState, @@ -1069,48 +1113,17 @@ impl Hasher { self.cv_stack.push(*new_cv); } - /// Add input bytes to the hash state. You can call this any number of - /// times. + /// Add input bytes to the hash state. You can call this any number of times. /// /// This method is always single-threaded. For multithreading support, see - /// [`update_rayon`](#method.update_rayon) below (enabled with the `rayon` - /// Cargo feature). - /// - /// Note that the degree of SIMD parallelism that `update` can use is - /// limited by the size of this input buffer. The 8 KiB buffer currently - /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but - /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to - /// leverage all currently supported SIMD instruction sets. + /// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature). 
/// - /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html + /// Note that the degree of SIMD parallelism that `update` can use is limited by the size of + /// this input buffer. See [`update_reader`](#method.update_reader). pub fn update(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } - /// Identical to [`update`](Hasher::update), but using Rayon-based - /// multithreading internally. - /// - /// This method is gated by the `rayon` Cargo feature, which is disabled by - /// default but enabled on [docs.rs](https://docs.rs). - /// - /// To get any performance benefit from multithreading, the input buffer - /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is - /// _slower_ than `update` for inputs under 128 KiB. That threshold varies - /// quite a lot across different processors, and it's important to benchmark - /// your specific use case. - /// - /// Memory mapping an entire input file is a simple way to take advantage of - /// multithreading without needing to carefully tune your buffer size or - /// offload IO. However, on spinning disks where random access is expensive, - /// that approach can lead to disk thrashing and terrible IO performance. - /// Note that OS page caching can mask this problem, in which case it might - /// only appear for files larger than available RAM. Again, benchmarking - /// your specific use case is important. - #[cfg(feature = "rayon")] - pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { - self.update_with_join::(input) - } - fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. 
@@ -1309,6 +1322,182 @@ impl Hasher { pub fn count(&self) -> u64 { self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 } + + /// As [`update`](Hasher::update), but reading from a + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation. + /// + /// [`Hasher`] implements + /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to + /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`] + /// from any reader. Unfortunately, this standard approach can limit performance, because + /// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of + /// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512) + /// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly + /// more convenient. + /// + /// The internal buffer size this method uses may change at any time, and it may be different + /// for different targets. The only guarantee is that it will be large enough for all of this + /// crate's SIMD implementations on the current platform. + /// + /// The most common implementer of + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be + /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory + /// mapping can be faster than this method for hashing large files. See + /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon), + /// which require the `mmap` and (for the latter) `rayon` Cargo features. + /// + /// This method requires the `std` Cargo feature, which is enabled by default. + /// + /// # Example + /// + /// ```no_run + /// # use std::fs::File; + /// # use std::io; + /// # fn main() -> io::Result<()> { + /// // Hash standard input. 
+ /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_reader(std::io::stdin().lock())?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "std")] + pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> { + io::copy_wide(reader, self)?; + Ok(self) + } + + /// As [`update`](Hasher::update), but using Rayon-based multithreading + /// internally. + /// + /// This method is gated by the `rayon` Cargo feature, which is disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// To get any performance benefit from multithreading, the input buffer + /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is + /// _slower_ than `update` for inputs under 128 KiB. That threshold varies + /// quite a lot across different processors, and it's important to benchmark + /// your specific use case. See also the performance warning associated with + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon). + /// + /// If you already have a large buffer in memory, and you want to hash it + /// with multiple threads, this method is a good option. However, reading a + /// file into memory just to call this method can be a performance mistake, + /// both because it requires lots of memory and because single-threaded + /// reads can be slow. For hashing whole files, see + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both + /// the `rayon` and `mmap` Cargo features. + #[cfg(feature = "rayon")] + pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { + self.update_with_join::(input) + } + + /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping. + /// + /// Not all files can be memory mapped, and memory mapping small files can be slower than + /// reading them the usual way. In those cases, this method will fall back to standard file IO. 
+ /// The heuristic for whether to use memory mapping is currently very simple (file size >= + /// 16 KiB), and it might change at any time. + /// + /// Like [`update`](Hasher::update), this method is single-threaded. In this author's + /// experience, memory mapping improves single-threaded performance by ~10% for large files + /// that are already in cache. This probably varies between platforms, and as always it's a + /// good idea to benchmark your own use case. In comparison, the multithreaded + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on + /// performance. + /// + /// There's a correctness reason that this method takes + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of + /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped + /// file ignores the seek position of the original file handle (it neither respects the current + /// position nor updates the position). This difference in behavior would've caused + /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and + /// have different side effects in some cases. Taking a + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by + /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is + /// opened internally. + /// + /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on + /// [docs.rs](https://docs.rs). 
+ /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// let path = Path::new("file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap(path)?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + pub fn update_mmap(&mut self, path: impl AsRef) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } + + /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using + /// memory mapping. This is the default behavior of `b3sum`. + /// + /// For large files that are likely to be in cache, this can be much faster than + /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other + /// cryptographic hashes, this is usually what they're measuring. However... + /// + /// **Performance Warning:** There are cases where multithreading hurts performance. The worst + /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31), + /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends + /// more time seeking around than reading data). Windows tends to be somewhat worse about this, + /// in part because it's less likely than Linux to keep very large files in cache. More + /// generally, if your CPU cores are already busy, then multithreading will add overhead + /// without improving performance. If your code runs in different environments that you don't + /// control and can't measure, then unfortunately there's no one-size-fits-all answer for + /// whether multithreading is a good idea. 
+ /// + /// The memory mapping behavior of this function is the same as + /// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard + /// file IO might change at any time. + /// + /// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// # #[cfg(feature = "rayon")] + /// # { + /// let path = Path::new("big_file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap_rayon(path)?; + /// println!("{}", hasher.finalize()); + /// # } + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + #[cfg(feature = "rayon")] + pub fn update_mmap_rayon( + &mut self, + path: impl AsRef, + ) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update_rayon(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } } // Don't derive(Debug), because the state may be secret. @@ -1366,6 +1555,7 @@ impl std::io::Write for Hasher { /// from an unknown position in the output stream to recover its block index. Callers with strong /// secret keys aren't affected in practice, but secret offsets are a [design /// smell](https://en.wikipedia.org/wiki/Design_smell) in any case. 
+#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] pub struct OutputReader { inner: Output, diff --git a/third-party/blake3/src/platform.rs b/third-party/blake3/src/platform.rs index 00058b16..79bc9a3f 100644 --- a/third-party/blake3/src/platform.rs +++ b/third-party/blake3/src/platform.rs @@ -56,6 +56,11 @@ pub enum Platform { impl Platform { #[allow(unreachable_code)] pub fn detect() -> Self { + #[cfg(miri)] + { + return Platform::Portable; + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(blake3_avx512_ffi)] @@ -327,7 +332,12 @@ impl Platform { #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx512_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_avx512") { return false; @@ -349,7 +359,12 @@ pub fn avx512_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx2_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_avx2") { return false; @@ -371,7 +386,12 @@ pub fn avx2_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn sse41_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_sse41") { return false; @@ -395,6 +415,10 @@ pub fn sse41_detected() -> bool { #[inline(always)] #[allow(unreachable_code)] pub fn sse2_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. 
if cfg!(feature = "no_sse2") { return false; diff --git a/third-party/blake3/src/test.rs b/third-party/blake3/src/test.rs index 60bbe8cc..c76cbbc0 100644 --- a/third-party/blake3/src/test.rs +++ b/third-party/blake3/src/test.rs @@ -628,3 +628,211 @@ const fn test_hash_const_conversions() { let hash = crate::Hash::from_bytes(bytes); _ = hash.as_bytes(); } + +#[cfg(feature = "zeroize")] +#[test] +fn test_zeroize() { + use zeroize::Zeroize; + + let mut hash = crate::Hash([42; 32]); + hash.zeroize(); + assert_eq!(hash.0, [0u8; 32]); + + let mut hasher = crate::Hasher { + chunk_state: crate::ChunkState { + cv: [42; 8], + chunk_counter: 42, + buf: [42; 64], + buf_len: 42, + blocks_compressed: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + key: [42; 8], + cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), + }; + hasher.zeroize(); + assert_eq!(hasher.chunk_state.cv, [0; 8]); + assert_eq!(hasher.chunk_state.chunk_counter, 0); + assert_eq!(hasher.chunk_state.buf, [0; 64]); + assert_eq!(hasher.chunk_state.buf_len, 0); + assert_eq!(hasher.chunk_state.blocks_compressed, 0); + assert_eq!(hasher.chunk_state.flags, 0); + assert!(matches!( + hasher.chunk_state.platform, + crate::Platform::Portable + )); + assert_eq!(hasher.key, [0; 8]); + assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); + + let mut output_reader = crate::OutputReader { + inner: crate::Output { + input_chaining_value: [42; 8], + block: [42; 64], + counter: 42, + block_len: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + position_within_block: 42, + }; + + output_reader.zeroize(); + assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); + assert_eq!(output_reader.inner.block, [0; 64]); + assert_eq!(output_reader.inner.counter, 0); + assert_eq!(output_reader.inner.block_len, 0); + assert_eq!(output_reader.inner.flags, 0); + assert!(matches!( + output_reader.inner.platform, + crate::Platform::Portable + )); + assert_eq!(output_reader.position_within_block, 0); +} + +#[test] 
+#[cfg(feature = "std")] +fn test_update_reader() -> Result<(), std::io::Error> { + // This is a brief test, since update_reader() is mostly a wrapper around update(), which already + // has substantial testing. + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + assert_eq!( + crate::Hasher::new().update_reader(&input[..])?.finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +fn test_update_reader_interrupted() -> std::io::Result<()> { + use std::io; + struct InterruptingReader<'a> { + already_interrupted: bool, + slice: &'a [u8], + } + impl<'a> InterruptingReader<'a> { + fn new(slice: &'a [u8]) -> Self { + Self { + already_interrupted: false, + slice, + } + } + } + impl<'a> io::Read for InterruptingReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if !self.already_interrupted { + self.already_interrupted = true; + return Err(io::Error::from(io::ErrorKind::Interrupted)); + } + let take = std::cmp::min(self.slice.len(), buf.len()); + buf[..take].copy_from_slice(&self.slice[..take]); + self.slice = &self.slice[take..]; + Ok(take) + } + } + + let input = b"hello world"; + let mut reader = InterruptingReader::new(input); + let mut hasher = crate::Hasher::new(); + hasher.update_reader(&mut reader)?; + assert_eq!(hasher.finalize(), crate::hash(input)); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already + // has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap(tempfile.path())? 
+ .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(target_os = "linux")] +fn test_mmap_virtual_file() -> Result<(), std::io::Error> { + // Virtual files like /proc/version can't be mmapped, because their contents don't actually + // exist anywhere in memory. Make sure we fall back to regular file IO in these cases. + // Currently this is handled with a length check, where the assumption is that virtual files + // will always report length 0. If that assumption ever breaks, hopefully this test will catch + // it. + let virtual_filepath = "/proc/version"; + let mut mmap_hasher = crate::Hasher::new(); + // We'll fail right here if the fallback doesn't work. + mmap_hasher.update_mmap(virtual_filepath)?; + let mut read_hasher = crate::Hasher::new(); + read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; + assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(feature = "rayon")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap_rayon() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), + // which already has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap_rayon(tempfile.path())? 
+ .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +#[cfg(feature = "serde")] +fn test_serde() { + let hash: crate::Hash = [7; 32].into(); + let json = serde_json::to_string(&hash).unwrap(); + assert_eq!( + json, + "[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]", + ); + let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); + assert_eq!(hash, hash2); +} + +// `cargo +nightly miri test` currently works, but it takes forever, because some of our test +// inputs are quite large. Most of our unsafe code is platform specific and incompatible with Miri +// anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming +// they don't use incompatible features like Rayon or mmap. This test should get reasonable +// coverage of our public API without using any large inputs, so we can run it in CI and catch +// obvious breaks. (For example, constant_time_eq is not compatible with Miri.) +#[test] +fn test_miri_smoketest() { + let mut hasher = crate::Hasher::new_derive_key("Miri smoketest"); + hasher.update(b"foo"); + #[cfg(feature = "std")] + hasher.update_reader(&b"bar"[..]).unwrap(); + assert_eq!(hasher.finalize(), hasher.finalize()); + let mut reader = hasher.finalize_xof(); + reader.set_position(999999); + reader.fill(&mut [0]); +} diff --git a/third-party/blake3/tools/release.md b/third-party/blake3/tools/release.md index 17a07b0f..924f3279 100644 --- a/third-party/blake3/tools/release.md +++ b/third-party/blake3/tools/release.md @@ -4,7 +4,7 @@ - Bump the version in the root Cargo.toml. - Bump the version in b3sum/Cargo.toml. - Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar. -- Update the `--help` output in b3sum/README.md if it's changed. +- Update the `-h` output in b3sum/README.md if it's changed. - Bump `BLAKE3_VERSION_STRING` in c/blake3.h. - Bump `VERSION` in c/CMakeLists.txt. - Make a version bump commit with change notes. 
diff --git a/third-party/mimalloc/.gitignore b/third-party/mimalloc/.gitignore index f8b7f5eb..df1d58eb 100644 --- a/third-party/mimalloc/.gitignore +++ b/third-party/mimalloc/.gitignore @@ -7,3 +7,5 @@ ide/vs20??/VTune* out/ docs/ *.zip +*.tar +*.gz diff --git a/third-party/mimalloc/CMakeLists.txt b/third-party/mimalloc/CMakeLists.txt index 2bcd1ef7..bcfe91d8 100644 --- a/third-party/mimalloc/CMakeLists.txt +++ b/third-party/mimalloc/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.18) project(libmimalloc C CXX) set(CMAKE_C_STANDARD 11) @@ -19,6 +19,7 @@ option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON) option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) +option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF) option(MI_BUILD_SHARED "Build shared library" ON) option(MI_BUILD_STATIC "Build static library" ON) option(MI_BUILD_OBJECT "Build object library" ON) @@ -27,12 +28,14 @@ option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF) option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF) option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." 
OFF) +option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) +option(MI_NO_THP "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF) # deprecated options option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF) -option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version (deprecated)" OFF) option(MI_USE_LIBATOMIC "Explicitly link with -latomic (on older systems) (deprecated and detected automatically)" OFF) +include(CheckLinkerFlag) # requires cmake 3.18 include(CheckIncludeFiles) include(GNUInstallDirs) include("cmake/mimalloc-config-version.cmake") @@ -45,6 +48,7 @@ set(mi_sources src/bitmap.c src/heap.c src/init.c + src/libc.c src/options.c src/os.c src/page.c @@ -55,6 +59,9 @@ set(mi_sources src/prim/prim.c) set(mi_cflags "") +set(mi_cflags_static "") # extra flags for a static library build +set(mi_cflags_dynamic "") # extra flags for a shared-object library build +set(mi_defines "") set(mi_libraries "") # ----------------------------------------------------------------------------- @@ -82,6 +89,17 @@ endif() # Process options # ----------------------------------------------------------------------------- +# put -Wall early so other warnings can be disabled selectively +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + list(APPEND mi_cflags -Wall -Wextra -Wpedantic) +endif() +if(CMAKE_C_COMPILER_ID MATCHES "GNU") + list(APPEND mi_cflags -Wall -Wextra) +endif() +if(CMAKE_C_COMPILER_ID MATCHES "Intel") + list(APPEND mi_cflags -Wall) +endif() + if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel") set(MI_USE_CXX "ON") endif() @@ -127,7 +145,7 @@ endif() if(MI_SECURE) message(STATUS "Set full secure build (MI_SECURE=ON)") - list(APPEND mi_defines MI_SECURE=4) + list(APPEND mi_defines MI_SECURE=4) endif() if(MI_TRACK_VALGRIND) @@ -184,6 +202,10 @@ endif() 
if(MI_SEE_ASM) message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") list(APPEND mi_cflags -save-temps) + if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + message(STATUS "No GNU Line marker") + list(APPEND mi_cflags -Wno-gnu-line-marker) + endif() endif() if(MI_CHECK_FULL) @@ -246,7 +268,7 @@ if(MI_DEBUG_UBSAN) message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)") endif() else() - message(WARNING "Can only use thread sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})") + message(WARNING "Can only use undefined-behavior sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})") endif() endif() @@ -262,31 +284,51 @@ if(MI_USE_CXX) endif() endif() -if(CMAKE_SYSTEM_NAME MATCHES "Haiku") - SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) - SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers) - endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android") + if(MI_NO_THP) + message(STATUS "Disable transparent huge pages support (MI_NO_THP=ON)") + list(APPEND mi_defines MI_NO_THP=1) + endif() +endif() + +if(MI_LIBC_MUSL) + message(STATUS "Assume using musl libc (MI_LIBC_MUSL=ON)") + list(APPEND mi_defines MI_LIBC_MUSL=1) +endif() + +# On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788 +# if(CMAKE_SYSTEM_NAME MATCHES "Haiku") +# SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) +# SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers) +# endif() # Compiler flags if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") - list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden) + list(APPEND mi_cflags -Wno-unknown-pragmas -fvisibility=hidden) if(NOT MI_USE_CXX) list(APPEND mi_cflags -Wstrict-prototypes) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") - list(APPEND mi_cflags -Wpedantic -Wno-static-in-inline) + list(APPEND mi_cflags -Wno-static-in-inline) endif() endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel") - list(APPEND mi_cflags -Wall 
-fvisibility=hidden) + list(APPEND mi_cflags -fvisibility=hidden) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") if(MI_LOCAL_DYNAMIC_TLS) list(APPEND mi_cflags -ftls-model=local-dynamic) else() - list(APPEND mi_cflags -ftls-model=initial-exec) + if(MI_LIBC_MUSL) + # with musl we use local-dynamic for the static build, see issue #644 + list(APPEND mi_cflags_static -ftls-model=local-dynamic) + list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) + message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") + else() + list(APPEND mi_cflags -ftls-model=initial-exec) + endif() endif() if(MI_OVERRIDE) list(APPEND mi_cflags -fno-builtin-malloc) @@ -297,29 +339,45 @@ if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) list(APPEND mi_cflags /Zc:__cplusplus) endif() +if(MINGW) + add_definitions(-D_WIN32_WINNT=0x600) +endif() + # extra needed libraries + +# we prefer -l test over `find_library` as sometimes core libraries +# like `libatomic` are not on the system path (see issue #898) +function(find_link_library libname outlibname) + check_linker_flag(C "-l${libname}" mi_has_lib${libname}) + if (mi_has_lib${libname}) + message(VERBOSE "link library: -l${libname}") + set(${outlibname} ${libname} PARENT_SCOPE) + else() + find_library(MI_LIBPATH libname) + if (MI_LIBPATH) + message(VERBOSE "link library ${libname} at ${MI_LIBPATH}") + set(${outlibname} ${MI_LIBPATH} PARENT_SCOPE) + else() + message(VERBOSE "link library not found: ${libname}") + set(${outlibname} "" PARENT_SCOPE) + endif() + endif() +endfunction() + if(WIN32) - list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt) - set(pc_libraries "-lpsapi -lshell32 -luser32 -ladvapi32 -lbcrypt") + list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt) else() - set(pc_libraries "") - find_library(MI_LIBPTHREAD pthread) - if (MI_LIBPTHREAD) - list(APPEND mi_libraries ${MI_LIBPTHREAD}) - set(pc_libraries "${pc_libraries} 
-pthread") - endif() - find_library(MI_LIBRT rt) - if(MI_LIBRT) - list(APPEND mi_libraries ${MI_LIBRT}) - set(pc_libraries "${pc_libraries} -lrt") + find_link_library("pthread" MI_LIB_PTHREAD) + if(MI_LIB_PTHREAD) + list(APPEND mi_libraries "${MI_LIB_PTHREAD}") endif() - find_library(MI_LIBATOMIC atomic) - if (NOT MI_LIBATOMIC AND MI_USE_LIBATOMIC) - set(MI_LIBATOMIC atomic) + find_link_library("rt" MI_LIB_RT) + if(MI_LIB_RT) + list(APPEND mi_libraries "${MI_LIB_RT}") endif() - if (MI_LIBATOMIC) - list(APPEND mi_libraries ${MI_LIBATOMIC}) - set(pc_libraries "${pc_libraries} -latomic") + find_link_library("atomic" MI_LIB_ATOMIC) + if(MI_LIB_ATOMIC) + list(APPEND mi_libraries "${MI_LIB_ATOMIC}") endif() endif() @@ -328,7 +386,8 @@ endif() # ----------------------------------------------------------------------------- # dynamic/shared library and symlinks always go to /usr/local/lib equivalent -set(mi_install_libdir "${CMAKE_INSTALL_LIBDIR}") +set(mi_install_libdir "${CMAKE_INSTALL_LIBDIR}") +set(mi_install_bindir "${CMAKE_INSTALL_BINDIR}") # static libraries and object files, includes, and cmake config files # are either installed at top level, or use versioned directories for side-by-side installation (default) @@ -394,7 +453,7 @@ if(MI_BUILD_SHARED) add_library(mimalloc SHARED ${mi_sources}) set_target_properties(mimalloc PROPERTIES VERSION ${mi_version} SOVERSION ${mi_version_major} OUTPUT_NAME ${mi_basename} ) target_compile_definitions(mimalloc PRIVATE ${mi_defines} MI_SHARED_LIB MI_SHARED_LIB_EXPORT) - target_compile_options(mimalloc PRIVATE ${mi_cflags}) + target_compile_options(mimalloc PRIVATE ${mi_cflags} ${mi_cflags_dynamic}) target_link_libraries(mimalloc PRIVATE ${mi_libraries}) target_include_directories(mimalloc PUBLIC $ @@ -412,10 +471,10 @@ if(MI_BUILD_SHARED) add_custom_command(TARGET mimalloc POST_BUILD COMMAND "${CMAKE_COMMAND}" -E copy "${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" $ COMMENT "Copy 
mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll to output directory") - install(FILES "$/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${mi_install_libdir}) + install(FILES "$/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${mi_install_bindir}) endif() - install(TARGETS mimalloc EXPORT mimalloc DESTINATION ${mi_install_libdir} LIBRARY) + install(TARGETS mimalloc EXPORT mimalloc ARCHIVE DESTINATION ${mi_install_libdir} RUNTIME DESTINATION ${mi_install_bindir} LIBRARY DESTINATION ${mi_install_libdir}) install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir}) endif() @@ -424,7 +483,7 @@ if (MI_BUILD_STATIC) add_library(mimalloc-static STATIC ${mi_sources}) set_property(TARGET mimalloc-static PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mimalloc-static PRIVATE ${mi_defines} MI_STATIC_LIB) - target_compile_options(mimalloc-static PRIVATE ${mi_cflags}) + target_compile_options(mimalloc-static PRIVATE ${mi_cflags} ${mi_cflags_static}) target_link_libraries(mimalloc-static PRIVATE ${mi_libraries}) target_include_directories(mimalloc-static PUBLIC $ @@ -456,7 +515,7 @@ if (MI_BUILD_OBJECT) add_library(mimalloc-obj OBJECT src/static.c) set_property(TARGET mimalloc-obj PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mimalloc-obj PRIVATE ${mi_defines}) - target_compile_options(mimalloc-obj PRIVATE ${mi_cflags}) + target_compile_options(mimalloc-obj PRIVATE ${mi_cflags} ${mi_cflags_static}) target_include_directories(mimalloc-obj PUBLIC $ $ @@ -467,7 +526,7 @@ if (MI_BUILD_OBJECT) set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}") set(mimalloc-obj-out "${CMAKE_CURRENT_BINARY_DIR}/${mi_basename}${CMAKE_C_OUTPUT_EXTENSION}") add_custom_command(OUTPUT ${mimalloc-obj-out} DEPENDS mimalloc-obj COMMAND "${CMAKE_COMMAND}" -E copy "${mimalloc-obj-static}" "${mimalloc-obj-out}") - add_custom_target(mimalloc-obj-target ALL DEPENDS 
${mimalloc-obj-out}) + add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out}) endif() # the following seems to lead to cmake warnings/errors on some systems, disable for now :-( @@ -481,6 +540,15 @@ if (MI_BUILD_OBJECT) endif() # pkg-config file support +set(pc_libraries "") +foreach(item IN LISTS mi_libraries) + if(item MATCHES " *[-].*") + set(pc_libraries "${pc_libraries} ${item}") + else() + set(pc_libraries "${pc_libraries} -l${item}") + endif() +endforeach() + include("cmake/JoinPaths.cmake") join_paths(includedir_for_pc_file "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") join_paths(libdir_for_pc_file "\${prefix}" "${CMAKE_INSTALL_LIBDIR}") @@ -489,6 +557,8 @@ configure_file(mimalloc.pc.in mimalloc.pc @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mimalloc.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") + + # ----------------------------------------------------------------------------- # API surface testing # ----------------------------------------------------------------------------- diff --git a/third-party/mimalloc/SECURITY.md b/third-party/mimalloc/SECURITY.md new file mode 100644 index 00000000..b3c89efc --- /dev/null +++ b/third-party/mimalloc/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
+ +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 
+ + diff --git a/third-party/mimalloc/bin/readme.md b/third-party/mimalloc/bin/readme.md new file mode 100644 index 00000000..9b121bda --- /dev/null +++ b/third-party/mimalloc/bin/readme.md @@ -0,0 +1,71 @@ +# Windows Override + +Dynamically overriding on mimalloc on Windows +is robust and has the particular advantage to be able to redirect all malloc/free calls that go through +the (dynamic) C runtime allocator, including those from other DLL's or libraries. +As it intercepts all allocation calls on a low level, it can be used reliably +on large programs that include other 3rd party components. +There are four requirements to make the overriding work robustly: + +1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). + +2. Link your program explicitly with `mimalloc-override.dll` library. + To ensure the `mimalloc-override.dll` is loaded at run-time it is easiest to insert some + call to the mimalloc API in the `main` function, like `mi_version()` + (or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project + for an example on how to use this. + +3. The `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be put + in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency of that DLL). + The redirection DLL ensures that all calls to the C runtime malloc API get redirected to + mimalloc functions (which reside in `mimalloc-override.dll`). + +4. Ensure the `mimalloc-override.dll` comes as early as possible in the import + list of the final executable (so it can intercept all potential allocations). + +For best performance on Windows with C++, it +is also recommended to also override the `new`/`delete` operations (by including +[`mimalloc-new-delete.h`](../include/mimalloc-new-delete.h) +a single(!) source file in your project). + +The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic +overriding at run-time. 
Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. + +## Minject + +We cannot always re-link an executable with `mimalloc-override.dll`, and similarly, we cannot always +ensure the the DLL comes first in the import table of the final executable. +In many cases though we can patch existing executables without any recompilation +if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll` +into the import table (and put `mimalloc-redirect.dll` in the same folder) +Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388). + +The `minject` program can also do this from the command line, use `minject --help` for options: + +``` +> minject --help + +minject: + Injects the mimalloc dll into the import table of a 64-bit executable, + and/or ensures that it comes first in het import table. + +usage: + > minject [options] + +options: + -h --help show this help + -v --verbose be verbose + -l --list only list imported modules + -i --inplace update the exe in-place (make sure there is a backup!) + -f --force always overwrite without prompting + --postfix=

use

as a postfix to the mimalloc dll (default is 'override') + e.g. use --postfix=override-debug to link with mimalloc-override-debug.dll + +notes: + Without '--inplace' an injected is generated with the same name ending in '-mi'. + Ensure 'mimalloc-redirect.dll' is in the same folder as the mimalloc dll. + +examples: + > minject --list myprogram.exe + > minject --force --inplace myprogram.exe +``` diff --git a/third-party/mimalloc/cmake/mimalloc-config-version.cmake b/third-party/mimalloc/cmake/mimalloc-config-version.cmake index a44c121d..81fd3c9d 100644 --- a/third-party/mimalloc/cmake/mimalloc-config-version.cmake +++ b/third-party/mimalloc/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 2) set(mi_version_minor 1) -set(mi_version_patch 2) +set(mi_version_patch 7) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/third-party/mimalloc/doc/doxyfile b/third-party/mimalloc/doc/doxyfile index 55cae8bf..d03a70f5 100644 --- a/third-party/mimalloc/doc/doxyfile +++ b/third-party/mimalloc/doc/doxyfile @@ -466,7 +466,7 @@ LOOKUP_CACHE_SIZE = 0 # than 0 to get more control over the balance between CPU load and processing # speed. At this moment only the input processing can be done using multiple # threads. Since this is still an experimental feature the default is set to 1, -# which efficively disables parallel processing. Please report any issues you +# which effectively disables parallel processing. Please report any issues you # encounter. Generating dot graphs in parallel is controlled by the # DOT_NUM_THREADS setting. # Minimum value: 0, maximum value: 32, default value: 1. 
diff --git a/third-party/mimalloc/doc/mimalloc-doc.h b/third-party/mimalloc/doc/mimalloc-doc.h index 3e75243b..d79eb2f8 100644 --- a/third-party/mimalloc/doc/mimalloc-doc.h +++ b/third-party/mimalloc/doc/mimalloc-doc.h @@ -168,7 +168,7 @@ void* mi_expand(void* p, size_t newsize); /// @returns A pointer to a block of \a count * \a size bytes, or \a NULL /// if out of memory or if \a count * \a size overflows. /// -/// If there is no overflow, it behaves exactly like `mi_malloc(p,count*size)`. +/// If there is no overflow, it behaves exactly like `mi_malloc(count*size)`. /// @see mi_calloc() /// @see mi_zallocn() void* mi_mallocn(size_t count, size_t size); @@ -441,7 +441,7 @@ bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_la /// @param pages The number of 1GiB pages to reserve. /// @param numa_nodes The number of nodes do evenly divide the pages over, or 0 for using the actual number of NUMA nodes. /// @param timeout_msecs Maximum number of milli-seconds to try reserving, or 0 for no timeout. -/// @returns 0 if successfull, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. +/// @returns 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. /// /// The reserved memory is used by mimalloc to satisfy allocations. /// May quit before \a timeout_msecs are expired if it estimates it will take more than @@ -455,7 +455,7 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t /// @param pages The number of 1GiB pages to reserve. /// @param numa_node The NUMA node where the memory is reserved (start at 0). /// @param timeout_msecs Maximum number of milli-seconds to try reserving, or 0 for no timeout. -/// @returns 0 if successfull, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. +/// @returns 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. /// /// The reserved memory is used by mimalloc to satisfy allocations. 
/// May quit before \a timeout_msecs are expired if it estimates it will take more than @@ -468,7 +468,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec /// Is the C runtime \a malloc API redirected? /// @returns \a true if all malloc API calls are redirected to mimalloc. /// -/// Currenty only used on Windows. +/// Currently only used on Windows. bool mi_is_redirected(); /// Return process information (time and memory usage). @@ -499,11 +499,11 @@ void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_m /// \{ /// The maximum supported alignment size (currently 1MiB). -#define MI_ALIGNMENT_MAX (1024*1024UL) +#define MI_BLOCK_ALIGNMENT_MAX (1024*1024UL) /// Allocate \a size bytes aligned by \a alignment. /// @param size number of bytes to allocate. -/// @param alignment the minimal alignment of the allocated memory. Must be less than #MI_ALIGNMENT_MAX. +/// @param alignment the minimal alignment of the allocated memory. Must be less than #MI_BLOCK_ALIGNMENT_MAX. /// @returns pointer to the allocated memory or \a NULL if out of memory. /// The returned pointer is aligned by \a alignment, i.e. /// `(uintptr_t)p % alignment == 0`. @@ -558,7 +558,7 @@ mi_heap_t* mi_heap_new(); /// Delete a previously allocated heap. /// This will release resources and migrate any -/// still allocated blocks in this heap (efficienty) +/// still allocated blocks in this heap (efficiently) /// to the default heap. /// /// If \a heap is the default heap, the default @@ -888,7 +888,7 @@ void mi_free_aligned(void* p, size_t alignment); /// /// Note: use the `mimalloc-new-delete.h` header to override the \a new /// and \a delete operators globally. The wrappers here are mostly -/// for convience for library writers that need to interface with +/// for convenience for library writers that need to interface with /// mimalloc from C++. 
/// /// \{ diff --git a/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile b/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile new file mode 100644 index 00000000..56f071db --- /dev/null +++ b/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile @@ -0,0 +1,28 @@ +# install from an image +# download first an appropiate tar.gz image into the current directory +# from: +FROM scratch + +# Substitute the image name that was downloaded +ADD alpine-minirootfs-20240329-armv7.tar.gz / + +# Install tools +RUN apk add build-base make cmake +RUN apk add git +RUN apk add vim + +RUN mkdir -p /home/dev +WORKDIR /home/dev + +# Get mimalloc +RUN git clone https://github.com/microsoft/mimalloc -b dev-slice +RUN mkdir -p mimalloc/out/release +RUN mkdir -p mimalloc/out/debug + +# Build mimalloc debug +WORKDIR /home/dev/mimalloc/out/debug +RUN cmake ../.. -DMI_DEBUG_FULL=ON +RUN make -j +RUN make test + +CMD ["/bin/sh"] diff --git a/third-party/mimalloc/docker/alpine/Dockerfile b/third-party/mimalloc/docker/alpine/Dockerfile new file mode 100644 index 00000000..b222b791 --- /dev/null +++ b/third-party/mimalloc/docker/alpine/Dockerfile @@ -0,0 +1,23 @@ +# alpine image +FROM alpine + +# Install tools +RUN apk add build-base make cmake +RUN apk add git +RUN apk add vim + +RUN mkdir -p /home/dev +WORKDIR /home/dev + +# Get mimalloc +RUN git clone https://github.com/microsoft/mimalloc -b dev-slice +RUN mkdir -p mimalloc/out/release +RUN mkdir -p mimalloc/out/debug + +# Build mimalloc debug +WORKDIR /home/dev/mimalloc/out/debug +RUN cmake ../.. 
-DMI_DEBUG_FULL=ON +RUN make -j +RUN make test + +CMD ["/bin/sh"] \ No newline at end of file diff --git a/third-party/mimalloc/docker/manylinux-x64/Dockerfile b/third-party/mimalloc/docker/manylinux-x64/Dockerfile new file mode 100644 index 00000000..22d37e5a --- /dev/null +++ b/third-party/mimalloc/docker/manylinux-x64/Dockerfile @@ -0,0 +1,23 @@ +FROM quay.io/pypa/manylinux2014_x86_64 + +# Install tools +RUN yum install -y openssl-devel +RUN yum install -y gcc gcc-c++ kernel-devel make +RUN yum install -y git cmake +RUN yum install -y vim + +RUN mkdir -p /home/dev +WORKDIR /home/dev + +# Get mimalloc +RUN git clone https://github.com/microsoft/mimalloc -b dev-slice +RUN mkdir -p mimalloc/out/release +RUN mkdir -p mimalloc/out/debug + +# Build mimalloc debug +WORKDIR /home/dev/mimalloc/out/debug +RUN cmake ../.. -DMI_DEBUG_FULL=ON +RUN make -j +RUN make test + +CMD ["/bin/sh"] \ No newline at end of file diff --git a/third-party/mimalloc/docker/readme.md b/third-party/mimalloc/docker/readme.md new file mode 100644 index 00000000..b3d90094 --- /dev/null +++ b/third-party/mimalloc/docker/readme.md @@ -0,0 +1,10 @@ +Various example docker files used for testing. + +Usage: + +``` +> cd +> docker build -t -mimalloc . 
+> docker run -it -mimalloc +>> make test +``` diff --git a/third-party/mimalloc/ide/vs2017/mimalloc-override.vcxproj b/third-party/mimalloc/ide/vs2017/mimalloc-override.vcxproj index 3d5c1f75..6d20eb57 100644 --- a/third-party/mimalloc/ide/vs2017/mimalloc-override.vcxproj +++ b/third-party/mimalloc/ide/vs2017/mimalloc-override.vcxproj @@ -238,6 +238,7 @@ + diff --git a/third-party/mimalloc/ide/vs2017/mimalloc.vcxproj b/third-party/mimalloc/ide/vs2017/mimalloc.vcxproj index 46eb05d8..ece9a14d 100644 --- a/third-party/mimalloc/ide/vs2017/mimalloc.vcxproj +++ b/third-party/mimalloc/ide/vs2017/mimalloc.vcxproj @@ -227,6 +227,7 @@ + diff --git a/third-party/mimalloc/ide/vs2019/mimalloc-override.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc-override.vcxproj index 1c5c61b7..a84a5178 100644 --- a/third-party/mimalloc/ide/vs2019/mimalloc-override.vcxproj +++ b/third-party/mimalloc/ide/vs2019/mimalloc-override.vcxproj @@ -238,6 +238,7 @@ + diff --git a/third-party/mimalloc/ide/vs2019/mimalloc.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc.vcxproj index 0e2eb312..0076b1db 100644 --- a/third-party/mimalloc/ide/vs2019/mimalloc.vcxproj +++ b/third-party/mimalloc/ide/vs2019/mimalloc.vcxproj @@ -219,6 +219,7 @@ + true diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-override.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc-override.vcxproj index e2c7f71d..df2a0816 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc-override.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc-override.vcxproj @@ -240,6 +240,7 @@ + true diff --git a/third-party/mimalloc/ide/vs2022/mimalloc.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc.vcxproj index c298550a..33ad9cef 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc.vcxproj @@ -217,8 +217,15 @@ false + + true + true + true + true + + true diff --git a/third-party/mimalloc/include/mimalloc-override.h b/third-party/mimalloc/include/mimalloc-override.h index 
c63b0b91..48a8a622 100644 --- a/third-party/mimalloc/include/mimalloc-override.h +++ b/third-party/mimalloc/include/mimalloc-override.h @@ -24,7 +24,7 @@ not accidentally mix pointers from different allocators). #define free(p) mi_free(p) #define strdup(s) mi_strdup(s) -#define strndup(s,n) mi_strndup(s,n) +#define strndup(s,n) mi_strndup(s,n) #define realpath(f,n) mi_realpath(f,n) // Microsoft extensions @@ -43,6 +43,7 @@ not accidentally mix pointers from different allocators). #define reallocf(p,n) mi_reallocf(p,n) #define malloc_size(p) mi_usable_size(p) #define malloc_usable_size(p) mi_usable_size(p) +#define malloc_good_size(sz) mi_malloc_good_size(sz) #define cfree(p) mi_free(p) #define valloc(n) mi_valloc(n) diff --git a/third-party/mimalloc/include/mimalloc.h b/third-party/mimalloc/include/mimalloc.h index f77c2ea1..c41bcc80 100644 --- a/third-party/mimalloc/include/mimalloc.h +++ b/third-party/mimalloc/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 212 // major + 2 digits minor +#define MI_MALLOC_VERSION 217 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes @@ -275,7 +275,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; @@ -318,40 +318,44 @@ mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size 
typedef enum mi_option_e { // stable options - mi_option_show_errors, // print error messages - mi_option_show_stats, // print statistics on termination - mi_option_verbose, // print verbose messages - // the following options are experimental (see src/options.h) - mi_option_eager_commit, // eager commit segments? (after `eager_commit_delay` segments) (=1) - mi_option_arena_eager_commit, // eager commit arenas? Use 2 to enable just on overcommit systems (=2) - mi_option_purge_decommits, // should a memory purge decommit (or only reset) (=1) - mi_option_allow_large_os_pages, // allow large (2MiB) OS pages, implies eager commit - mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB/page) at startup - mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node - mi_option_reserve_os_memory, // reserve specified amount of OS memory in an arena at startup + mi_option_show_errors, // print error messages + mi_option_show_stats, // print statistics on termination + mi_option_verbose, // print verbose messages + // advanced options + mi_option_eager_commit, // eager commit segments? (after `eager_commit_delay` segments) (=1) + mi_option_arena_eager_commit, // eager commit arenas? Use 2 to enable just on overcommit systems (=2) + mi_option_purge_decommits, // should a memory purge decommit? (=1). Set to 0 to use memory reset on a purge (instead of decommit) + mi_option_allow_large_os_pages, // allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process. 
+ mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB pages) at startup + mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node + mi_option_reserve_os_memory, // reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`) mi_option_deprecated_segment_cache, mi_option_deprecated_page_reset, - mi_option_abandoned_page_purge, // immediately purge delayed purges on thread termination + mi_option_abandoned_page_purge, // immediately purge delayed purges on thread termination mi_option_deprecated_segment_reset, - mi_option_eager_commit_delay, - mi_option_purge_delay, // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. - mi_option_use_numa_nodes, // 0 = use all available numa nodes, otherwise use at most N nodes. - mi_option_limit_os_alloc, // 1 = do not use OS memory for allocation (but only programmatically reserved arenas) - mi_option_os_tag, // tag used for OS logging (macOS only for now) - mi_option_max_errors, // issue at most N error messages - mi_option_max_warnings, // issue at most N warning messages - mi_option_max_segment_reclaim, - mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe. - mi_option_arena_reserve, // initial memory size in KiB for arena reservation (1GiB on 64-bit) - mi_option_arena_purge_mult, + mi_option_eager_commit_delay, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) + mi_option_purge_delay, // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. (=10) + mi_option_use_numa_nodes, // 0 = use all available numa nodes, otherwise use at most N nodes. 
+ mi_option_disallow_os_alloc, // 1 = do not use OS memory for allocation (but only programmatically reserved arenas) + mi_option_os_tag, // tag used for OS logging (macOS only for now) (=100) + mi_option_max_errors, // issue at most N error messages + mi_option_max_warnings, // issue at most N warning messages + mi_option_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) + mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe + mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) + mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) mi_option_purge_extend_delay, + mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) + mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, mi_option_eager_region_commit = mi_option_arena_eager_commit, mi_option_reset_decommits = mi_option_purge_decommits, mi_option_reset_delay = mi_option_purge_delay, - mi_option_abandoned_page_reset = mi_option_abandoned_page_purge + mi_option_abandoned_page_reset = mi_option_abandoned_page_purge, + mi_option_limit_os_alloc = mi_option_disallow_os_alloc } mi_option_t; @@ -494,7 +498,7 @@ template struct _mi_heap_stl_allocator_common : publi using typename _mi_stl_allocator_common::value_type; using typename _mi_stl_allocator_common::pointer; - _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp) { } /* will not delete nor destroy the passed in heap */ + _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp, [](mi_heap_t*) {}) {} /* will not delete nor destroy the passed in heap */ #if (__cplusplus >= 201703L) // C++17 mi_decl_nodiscard T* allocate(size_type count) { return static_cast(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); } @@ -513,7 +517,7 @@ template struct _mi_heap_stl_allocator_common : publi protected: std::shared_ptr heap; template friend struct _mi_heap_stl_allocator_common; - + _mi_heap_stl_allocator_common() { mi_heap_t* hp = mi_heap_new(); this->heap.reset(hp, (_mi_destroy ? 
&heap_destroy : &heap_delete)); /* calls heap_delete/destroy when the refcount drops to zero */ @@ -530,7 +534,7 @@ template struct _mi_heap_stl_allocator_common : publi template struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common { using typename _mi_heap_stl_allocator_common::size_type; mi_heap_stl_allocator() : _mi_heap_stl_allocator_common() { } // creates fresh heap that is deleted when the destructor is called - mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap + mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap template mi_heap_stl_allocator(const mi_heap_stl_allocator& x) mi_attr_noexcept : _mi_heap_stl_allocator_common(x) { } mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; } @@ -547,7 +551,7 @@ template bool operator!=(const mi_heap_stl_allocator& x, template struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common { using typename _mi_heap_stl_allocator_common::size_type; mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common() { } // creates fresh heap that is destroyed when the destructor is called - mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap + mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap template mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator& x) mi_attr_noexcept : _mi_heap_stl_allocator_common(x) { } mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; } diff --git a/third-party/mimalloc/include/mimalloc/atomic.h b/third-party/mimalloc/include/mimalloc/atomic.h index fe418fab..d5333dd9 100644 --- a/third-party/mimalloc/include/mimalloc/atomic.h +++ b/third-party/mimalloc/include/mimalloc/atomic.h 
@@ -23,8 +23,10 @@ terms of the MIT license. A copy of the license can be found in the file #define _Atomic(tp) std::atomic #define mi_atomic(name) std::atomic_##name #define mi_memory_order(name) std::memory_order_##name -#if !defined(ATOMIC_VAR_INIT) || (__cplusplus >= 202002L) // c++20, see issue #571 - #define MI_ATOMIC_VAR_INIT(x) x +#if (__cplusplus >= 202002L) // c++20, see issue #571 +#define MI_ATOMIC_VAR_INIT(x) x +#elif !defined(ATOMIC_VAR_INIT) +#define MI_ATOMIC_VAR_INIT(x) x #else #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif @@ -39,7 +41,9 @@ terms of the MIT license. A copy of the license can be found in the file #include #define mi_atomic(name) atomic_##name #define mi_memory_order(name) memory_order_##name -#if !defined(ATOMIC_VAR_INIT) || (__STDC_VERSION__ >= 201710L) // c17, see issue #735 +#if (__STDC_VERSION__ >= 201710L) // c17, see issue #735 + #define MI_ATOMIC_VAR_INIT(x) x +#elif !defined(ATOMIC_VAR_INIT) #define MI_ATOMIC_VAR_INIT(x) x #else #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) @@ -128,8 +132,10 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { #elif defined(_MSC_VER) -// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics. +// Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics. 
+#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #include #ifdef _WIN64 @@ -195,7 +201,7 @@ static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_ #else uintptr_t x = *p; if (mo > mi_memory_order_relaxed) { - while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ }; + while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ }; } return x; #endif @@ -323,7 +329,9 @@ static inline void mi_atomic_yield(void) { std::this_thread::yield(); } #elif defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include static inline void mi_atomic_yield(void) { YieldProcessor(); diff --git a/third-party/mimalloc/include/mimalloc/internal.h b/third-party/mimalloc/include/mimalloc/internal.h index 00d26260..6c6e5ed0 100644 --- a/third-party/mimalloc/include/mimalloc/internal.h +++ b/third-party/mimalloc/include/mimalloc/internal.h @@ -14,8 +14,8 @@ terms of the MIT license. A copy of the license can be found in the file // functions and macros. // -------------------------------------------------------------------------- -#include "mimalloc/types.h" -#include "mimalloc/track.h" +#include "types.h" +#include "track.h" #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) @@ -30,14 +30,17 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#define mi_decl_weak #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread #define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#define mi_decl_weak __attribute__((weak)) #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) #define mi_decl_cache_align +#define mi_decl_weak #endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) @@ -85,10 +88,11 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); +void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); // os.c void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); +void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); @@ -122,9 +126,21 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); bool _mi_arena_contains(const void* p); -void _mi_arena_collect(bool force_purge, mi_stats_t* stats); +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); +bool _mi_arena_segment_clear_abandoned(mi_segment_t* 
segment); +void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); +size_t _mi_arena_segment_abandoned_count(void); + +typedef struct mi_arena_field_cursor_s { // abstract + mi_arena_id_t start; + int count; + size_t bitmap_idx; +} mi_arena_field_cursor_t; +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); + // "segment-map.c" void _mi_segment_map_allocated_at(const mi_segment_t* segment); void _mi_segment_map_freed_at(const mi_segment_t* segment); @@ -134,7 +150,7 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t pag void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); -void _mi_segment_thread_collect(mi_segments_tld_t* tld); +void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld); #if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); @@ -146,6 +162,7 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld); +bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -170,11 +187,13 @@ size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats // "heap.c" +void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); void _mi_heap_destroy_pages(mi_heap_t* heap); void 
_mi_heap_collect_abandon(mi_heap_t* heap); void _mi_heap_set_default_direct(mi_heap_t* heap); bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); void _mi_heap_unsafe_destroy_all(void); +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); // "stats.c" void _mi_stats_done(mi_stats_t* stats); @@ -183,23 +202,28 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); // "alloc.c" -void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); -void _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration +void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); -// option.c, c primitives +// "libc.c" +#include +void _mi_vsnprintf(char* buf, size_t bufsize, const 
char* fmt, va_list args); +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); char _mi_toupper(char c); int _mi_strnicmp(const char* s, const char* t, size_t n); void _mi_strlcpy(char* dest, const char* src, size_t dest_size); void _mi_strlcat(char* dest, const char* src, size_t dest_size); size_t _mi_strlen(const char* s); size_t _mi_strnlen(const char* s, size_t max_len); - +bool _mi_getenv(const char* name, char* result, size_t result_size); #if MI_DEBUG>1 bool _mi_page_is_valid(mi_page_t* page); @@ -308,6 +332,17 @@ static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { } } +// Align a pointer upwards +static inline void* mi_align_up_ptr(void* p, size_t alignment) { + return (void*)_mi_align_up((uintptr_t)p, alignment); +} + +// Align a pointer downwards +static inline void* mi_align_down_ptr(void* p, size_t alignment) { + return (void*)_mi_align_down((uintptr_t)p, alignment); +} + + // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { mi_assert_internal(divider != 0); @@ -347,10 +382,10 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { } #else /* __builtin_umul_overflow is unavailable */ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { - #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) + #define MI_MUL_COULD_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) *total = count * size; // note: gcc/clang optimize this to directly check the overflow flag - return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count); + return ((size >= MI_MUL_COULD_OVERFLOW || count >= MI_MUL_COULD_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count); } #endif @@ -407,9 +442,14 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si // Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a 
huge segment > MI_SEGMENT_SIZE), // and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; // therefore we align one byte before `p`. +// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. static inline mi_segment_t* _mi_ptr_segment(const void* p) { - mi_assert_internal(p != NULL); - return (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); + mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); + #if MI_INTPTR_SIZE <= 4 + return (p==NULL ? NULL : segment); + #else + return ((intptr_t)segment <= 0 ? NULL : segment); + #endif } static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) { @@ -424,7 +464,8 @@ static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) { // Segment belonging to a page static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { - mi_segment_t* segment = _mi_ptr_segment(page); + mi_assert_internal(page!=NULL); + mi_segment_t* segment = _mi_ptr_segment(page); mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries)); return segment; } @@ -452,31 +493,28 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const } // Quick page start for initialized pages -static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - return _mi_segment_page_start(segment, page, page_size); +static inline uint8_t* mi_page_start(const mi_page_t* page) { + mi_assert_internal(page->page_start != NULL); + mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); + return page->page_start; } // Get the page containing the pointer static inline mi_page_t* _mi_ptr_page(void* p) { + mi_assert_internal(p!=NULL); return _mi_segment_page_of(_mi_ptr_segment(p), p); } // Get the block size of a page (special case for huge objects) static inline size_t 
mi_page_block_size(const mi_page_t* page) { - const size_t bsize = page->xblock_size; - mi_assert_internal(bsize > 0); - if mi_likely(bsize < MI_HUGE_BLOCK_SIZE) { - return bsize; - } - else { - size_t psize; - _mi_segment_page_start(_mi_page_segment(page), page, &psize); - return psize; - } + mi_assert_internal(page->block_size > 0); + return page->block_size; } static inline bool mi_page_is_huge(const mi_page_t* page) { - return (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE); + mi_assert_internal((page->is_huge && _mi_page_segment(page)->kind == MI_SEGMENT_HUGE) || + (!page->is_huge && _mi_page_segment(page)->kind != MI_SEGMENT_HUGE)); + return page->is_huge; } // Get the usable block size of a page without fixed padding. @@ -511,6 +549,7 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); + if (heap != NULL) { page->heap_tag = heap->tag; } } // Thread free flag helpers @@ -726,12 +765,12 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx); #define mi_commit_mask_foreach(cm,idx,count) \ idx = 0; \ - while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { - + while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { + #define mi_commit_mask_foreach_end() \ idx += count; \ } - + /* ----------------------------------------------------------- diff --git a/third-party/mimalloc/include/mimalloc/prim.h b/third-party/mimalloc/include/mimalloc/prim.h index 9e560696..3f4574dd 100644 --- a/third-party/mimalloc/include/mimalloc/prim.h +++ b/third-party/mimalloc/include/mimalloc/prim.h @@ -14,19 +14,19 @@ terms of the MIT license. A copy of the license can be found in the file // Each OS/host needs to implement these primitives, see `src/prim` // for implementations on Window, macOS, WASI, and Linux/Unix. 
// -// note: on all primitive functions, we always have result parameters != NUL, and: +// note: on all primitive functions, we always have result parameters != NULL, and: // addr != NULL and page aligned // size > 0 and page aligned -// return value is an error code an int where 0 is success. +// the return value is an error code as an `int` where 0 is success // -------------------------------------------------------------------------- // OS memory configuration typedef struct mi_os_mem_config_s { - size_t page_size; // 4KiB - size_t large_page_size; // 2MiB - size_t alloc_granularity; // smallest allocation size (on Windows 64KiB) + size_t page_size; // default to 4KiB + size_t large_page_size; // 0 if not supported, usually 2MiB (4MiB on Windows) + size_t alloc_granularity; // smallest allocation size (usually 4KiB, on Windows 64KiB) bool has_overcommit; // can we reserve more memory than can be actually committed? - bool must_free_whole; // must allocated blocks be freed as a whole (false for mmap, true for VirtualAlloc) + bool has_partial_free; // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc) bool has_virtual_reserve; // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory) } mi_os_mem_config_t; @@ -35,10 +35,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ); // Free OS memory int _mi_prim_free(void* addr, size_t size ); - + // Allocate OS memory. Return NULL on error. // The `try_alignment` is just a hint and the returned pointer does not have to be aligned. -// If `commit` is false, the virtual memory range only needs to be reserved (with no access) +// If `commit` is false, the virtual memory range only needs to be reserved (with no access) // which will later be committed explicitly using `_mi_prim_commit`. 
// `is_zero` is set to true if the memory was zero initialized (as on most OS's) // pre: !commit => !allow_large @@ -82,11 +82,11 @@ mi_msecs_t _mi_prim_clock_now(void); typedef struct mi_process_info_s { mi_msecs_t elapsed; mi_msecs_t utime; - mi_msecs_t stime; - size_t current_rss; - size_t peak_rss; + mi_msecs_t stime; + size_t current_rss; + size_t peak_rss; size_t current_commit; - size_t peak_commit; + size_t peak_commit; size_t page_faults; } mi_process_info_t; @@ -117,7 +117,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); //------------------------------------------------------------------- // Thread id: `_mi_prim_thread_id()` -// +// // Getting the thread id should be performant as it is called in the // fast path of `_mi_free` and we specialize for various platforms as // inlined definitions. Regular code should call `init.c:_mi_thread_id()`. @@ -125,33 +125,24 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); // for each thread (unequal to zero). //------------------------------------------------------------------- -// defined in `init.c`; do not use these directly -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern bool _mi_process_is_initialized; // has mi_process_init been called? - -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; - -#if defined(_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { - // Windows: works on Intel and ARM in both 32- and 64-bit - return (uintptr_t)NtCurrentTeb(); -} - -// We use assembly for a fast thread id on the main platforms. The TLS layout depends on -// both the OS and libc implementation so we use specific tests for each main platform. +// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot. +// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform. 
// If you test on another platform and it works please send a PR :-) // see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. -#elif defined(__GNUC__) && ( \ +// +// Note: we would like to prefer `__builtin_thread_pointer()` nowadays instead of using assembly, +// but unfortunately we can not detect support reliably (see issue #883) +// We also use it on Apple OS as we use a TLS slot for the default heap there. +#if defined(__GNUC__) && ( \ (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ - || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ + || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \ || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ ) +#define MI_HAS_TLS_SLOT + static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { void* res; const size_t ofs = (slot*sizeof(void*)); @@ -175,6 +166,9 @@ static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); #endif res = tcb[slot]; + #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 + MI_UNUSED(ofs); + res = pthread_getspecific(slot); #endif return res; } @@ -202,9 +196,63 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); #endif tcb[slot] = value; + #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 + MI_UNUSED(ofs); + pthread_setspecific(slot, value); #endif } +#endif + +// Do we have __builtin_thread_pointer? 
This would be the preferred way to get a unique thread id +// but unfortunately, it seems we cannot test for this reliably at this time (see issue #883) +// Nevertheless, it seems needed on older graviton platforms (see issue #851). +// For now, we only enable this for specific platforms. +#if !defined(__APPLE__) /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ + && !defined(MI_LIBC_MUSL) \ + && (!defined(__clang_major__) || __clang_major__ >= 14) /* older clang versions emit bad code; fall back to using the TLS slot () */ + #if (defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \ + || (defined(__GNUC__) && (__GNUC__ >= 11) && defined(__x86_64__)) \ + || (defined(__clang_major__) && (__clang_major__ >= 14) && (defined(__aarch64__) || defined(__x86_64__))) + #define MI_USE_BUILTIN_THREAD_POINTER 1 + #endif +#endif + + + +// defined in `init.c`; do not use these directly +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern bool _mi_process_is_initialized; // has mi_process_init been called? + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; + +// Get a unique id for the current thread. 
+#if defined(MI_PRIM_THREAD_ID) + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) +} + +#elif defined(_WIN32) + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} + +#elif MI_USE_BUILTIN_THREAD_POINTER + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Works on most Unix based platforms with recent compilers + return (uintptr_t)__builtin_thread_pointer(); +} + +#elif defined(MI_HAS_TLS_SLOT) + static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #if defined(__BIONIC__) // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id @@ -251,7 +299,6 @@ static inline mi_heap_t* mi_prim_get_default_heap(void); #if defined(MI_MALLOC_OVERRIDE) #if defined(__APPLE__) // macOS #define MI_TLS_SLOT 89 // seems unused? 
- // #define MI_TLS_RECURSE_GUARD 1 // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) // see #elif defined(__OpenBSD__) @@ -269,6 +316,9 @@ static inline mi_heap_t* mi_prim_get_default_heap(void); #if defined(MI_TLS_SLOT) +# if !defined(MI_HAS_TLS_SLOT) +# error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined" +# endif static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); diff --git a/third-party/mimalloc/include/mimalloc/track.h b/third-party/mimalloc/include/mimalloc/track.h index 9545f750..a659d940 100644 --- a/third-party/mimalloc/include/mimalloc/track.h +++ b/third-party/mimalloc/include/mimalloc/track.h @@ -82,7 +82,9 @@ defined, undefined, or not accessible at all: #define MI_TRACK_HEAP_DESTROY 1 #define MI_TRACK_TOOL "ETW" +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #include "../src/prim/windows/etw.h" diff --git a/third-party/mimalloc/include/mimalloc/types.h b/third-party/mimalloc/include/mimalloc/types.h index 2005238a..2fdde904 100644 --- a/third-party/mimalloc/include/mimalloc/types.h +++ b/third-party/mimalloc/include/mimalloc/types.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -13,15 +13,18 @@ terms of the MIT license. A copy of the license can be found in the file // mi_heap_t : all data for a thread-local heap, contains // lists of all managed heap pages. // mi_segment_t : a larger chunk of memory (32GiB) from where pages -// are allocated. 
-// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from +// are allocated. A segment is divided in slices (64KiB) from +// which pages are allocated. +// mi_page_t : a "mimalloc" page (usually 64KiB or 512KiB) from // where objects are allocated. +// Note: we write "OS page" for OS memory pages while +// using plain "page" for mimalloc pages (`mi_page_t`). // -------------------------------------------------------------------------- #include // ptrdiff_t #include // uintptr_t, uint16_t, etc -#include "mimalloc/atomic.h" // _Atomic +#include "atomic.h" // _Atomic #ifdef _MSC_VER #pragma warning(disable:4214) // bitfield is not int @@ -89,10 +92,11 @@ terms of the MIT license. A copy of the license can be found in the file #endif -// We used to abandon huge pages but to eagerly deallocate if freed from another thread, -// but that makes it not possible to visit them during a heap walk or include them in a -// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks if freed from -// another thread so most memory is available until it gets properly freed by the owning thread. +// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. +// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a +// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from +// another thread so the memory becomes "virtually" available (and eventually gets properly freed by +// the owning thread). 
// #define MI_HUGE_PAGE_ABANDON 1 @@ -157,17 +161,24 @@ typedef int32_t mi_ssize_t; // Main tuning parameters for segment and page sizes // Sizes for 64-bit (usually divide by two for 32-bit) +#ifndef MI_SEGMENT_SLICE_SHIFT #define MI_SEGMENT_SLICE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB (32KiB on 32-bit) +#endif +#ifndef MI_SEGMENT_SHIFT #if MI_INTPTR_SIZE > 4 #define MI_SEGMENT_SHIFT ( 9 + MI_SEGMENT_SLICE_SHIFT) // 32MiB #else #define MI_SEGMENT_SHIFT ( 7 + MI_SEGMENT_SLICE_SHIFT) // 4MiB on 32-bit #endif +#endif +#ifndef MI_SMALL_PAGE_SHIFT #define MI_SMALL_PAGE_SHIFT (MI_SEGMENT_SLICE_SHIFT) // 64KiB +#endif +#ifndef MI_MEDIUM_PAGE_SHIFT #define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB - +#endif // Derived constants #define MI_SEGMENT_SIZE (MI_ZU(1)<> 1) -// blocks up to this size are always allocated aligned -#define MI_MAX_ALIGN_GUARANTEE (8*MI_MAX_ALIGN_SIZE) +// Maximum slice count (255) for which we can find the page for interior pointers +#define MI_MAX_SLICE_OFFSET_COUNT ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1) -// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments -#define MI_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +// we never allocate more than PTRDIFF_MAX (see also ) +// on 64-bit+ systems we also limit the maximum allocation size such that the slice count fits in 32-bits. 
(issue #877) +#if (PTRDIFF_MAX > INT32_MAX) && (PTRDIFF_MAX >= (MI_SEGMENT_SLIZE_SIZE * UINT32_MAX)) +#define MI_MAX_ALLOC_SIZE (MI_SEGMENT_SLICE_SIZE * (UINT32_MAX-1)) +#else +#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +#endif // ------------------------------------------------------ @@ -227,7 +243,7 @@ typedef enum mi_delayed_e { MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list - MI_NEVER_DELAYED_FREE = 3 // sticky, only resets on page reclaim + MI_NEVER_DELAYED_FREE = 3 // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim } mi_delayed_t; @@ -266,7 +282,6 @@ typedef uintptr_t mi_thread_free_t; // implement a monotonic heartbeat. The `thread_free` list is needed for // avoiding atomic operations in the common case. // -// // `used - |thread_free|` == actual blocks that are in use (alive) // `used - |thread_free| + |free| + |local_free| == capacity` // @@ -274,16 +289,13 @@ typedef uintptr_t mi_thread_free_t; // the number of memory accesses in the `mi_page_all_free` function(s). // // Notes: -// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`) +// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 8 words on 64-bit which helps the page index calculations -// (and 10 words on 32-bit, and encoded free lists add 2 words. 
Sizes 10 -// and 12 are still good for address calculation) -// - To limit the structure size, the `xblock_size` is 32-bits only; for -// blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size -// - `thread_free` uses the bottom bits as a delayed-free flags to optimize +// - The size is 12 words on 64-bit which helps the page index calculations +// (and 14 words on 32-bit, and encoded free lists add 2 words) +// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning -// heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`). +// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). // The invariant is that no-delayed-free is only set if there is // at least one block that will be added, or as already been added, to // the owning heap `thread_delayed_free` list. This guarantees that pages @@ -291,21 +303,26 @@ typedef uintptr_t mi_thread_free_t; typedef struct mi_page_s { // "owned" by the segment uint32_t slice_count; // slices in this page (0 if not a page) - uint32_t slice_offset; // distance from the actual page data slice (0 if a page) - uint8_t is_committed : 1; // `true` if the page virtual memory is committed - uint8_t is_zero_init : 1; // `true` if the page was initially zero initialized - + uint32_t slice_offset; // distance from the actual page data slice (0 if a page) + uint8_t is_committed:1; // `true` if the page virtual memory is committed + uint8_t is_zero_init:1; // `true` if the page was initially zero initialized + uint8_t is_huge:1; // `true` if the page is in a huge segment (`segment->kind == MI_SEGMENT_HUGE`) + // padding // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` uint16_t reserved; // number of blocks reserved in memory mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 
bits) - uint8_t free_is_zero : 1; // `true` if the blocks in the free list are zero initialized - uint8_t retire_expire : 7; // expiration count for retired blocks + uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized + uint8_t retire_expire:7; // expiration count for retired blocks mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) - uint32_t xblock_size; // size available in each block (always `>0`) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t heap_tag; // tag of the owning heap, used for separated heaps by object type + // padding + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the page area containing the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary @@ -317,10 +334,8 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by this thread with the same `block_size` struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` - // 64-bit 9 words, 32-bit 12 words, (+2 for secure) - #if MI_INTPTR_SIZE==8 - uintptr_t padding[1]; - #endif + // 64-bit 11 words, 32-bit 13 words, (+2 for secure) + void* padding[1]; } mi_page_t; @@ -331,21 +346,22 @@ typedef struct mi_page_s { typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment - MI_PAGE_MEDIUM, // medium blocks go into medium pages inside a segment - MI_PAGE_LARGE, // larger blocks go into a page of just one block - MI_PAGE_HUGE, // 
huge blocks (> 16 MiB) are put into a single page in a single segment. + MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment + MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment + MI_PAGE_HUGE // a huge page is a single page in a segment of variable size + // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. } mi_page_kind_t; typedef enum mi_segment_kind_e { MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside. - MI_SEGMENT_HUGE, // > MI_LARGE_SIZE_MAX segment with just one huge page inside. + MI_SEGMENT_HUGE, // segment with just one huge page inside. } mi_segment_kind_t; // ------------------------------------------------------ // A segment holds a commit mask where a bit is set if // the corresponding MI_COMMIT_SIZE area is committed. // The MI_COMMIT_SIZE must be a multiple of the slice -// size. If it is equal we have the most fine grained +// size. If it is equal we have the most fine grained // decommit (but setting it higher can be more efficient). 
// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will // be committed in one go which can be set higher than @@ -353,9 +369,9 @@ typedef enum mi_segment_kind_e { // is still tracked in fine-grained MI_COMMIT_SIZE chunks) // ------------------------------------------------------ -#define MI_MINIMAL_COMMIT_SIZE (1*MI_SEGMENT_SLICE_SIZE) +#define MI_MINIMAL_COMMIT_SIZE (1*MI_SEGMENT_SLICE_SIZE) #define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE) // 64KiB -#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE) +#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE) #define MI_COMMIT_MASK_FIELD_BITS MI_SIZE_BITS #define MI_COMMIT_MASK_FIELD_COUNT (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS) @@ -371,13 +387,17 @@ typedef mi_page_t mi_slice_t; typedef int64_t mi_msecs_t; +// --------------------------------------------------------------- +// a memory id tracks the provenance of arena/OS allocated memory +// --------------------------------------------------------------- + // Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) MI_MEM_OS, // allocated from the OS - MI_MEM_OS_HUGE, // allocated as huge os pages + MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) MI_MEM_ARENA // allocated from an arena (the usual case) } mi_memkind_t; @@ -394,7 +414,7 @@ typedef struct mi_memid_os_info { typedef struct mi_memid_arena_info { size_t block_index; // index in the arena mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // the arena can only be used for specific arena allocations + bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; typedef struct mi_memid_s { @@ -402,47 +422,56 @@ typedef struct mi_memid_s { mi_memid_os_info_t os; // only used for MI_MEM_OS mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA } mem; - bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages) + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) bool initially_committed;// `true` if the memory was originally allocated as committed bool initially_zero; // `true` if the memory was originally zero initialized mi_memkind_t memkind; } mi_memid_t; -// Segments are large allocated memory blocks (8mb on 64 bit) from -// the OS. Inside segments we allocated fixed size _pages_ that -// contain blocks. +// ----------------------------------------------------------------------------------------- +// Segments are large allocated memory blocks (8mb on 64 bit) from arenas or the OS. +// +// Inside segments we allocated fixed size mimalloc pages (`mi_page_t`) that contain blocks. +// The start of a segment is this structure with a fixed number of slice entries (`slices`) +// usually followed by a guard OS page and the actual allocation area with pages. +// While a page is not allocated, we view it's data as a `mi_slice_t` (instead of a `mi_page_t`). 
+// Of any free area, the first slice has the info and `slice_offset == 0`; for any subsequent +// slices part of the area, the `slice_offset` is the byte offset back to the first slice +// (so we can quickly find the page info on a free, `internal.h:_mi_segment_page_of`). +// For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`). +// Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while +// large and huge pages span a variable amount of slices. typedef struct mi_segment_s { // constant fields - mi_memid_t memid; // memory id for arena allocation - bool allow_decommit; - bool allow_purge; + mi_memid_t memid; // memory id for arena/OS allocation + bool allow_decommit; // can we decommit the memory + bool allow_purge; // can we purge the memory (reset or decommit) size_t segment_size; // segment fields - mi_msecs_t purge_expire; - mi_commit_mask_t purge_mask; - mi_commit_mask_t commit_mask; - - _Atomic(struct mi_segment_s*) abandoned_next; + mi_msecs_t purge_expire; // purge slices in the `purge_mask` after this time + mi_commit_mask_t purge_mask; // slices that can be purged + mi_commit_mask_t commit_mask; // slices that are currently committed // from here is zero initialized struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) - + bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) + size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) + size_t abandoned_visits; // count how often this segment is visited during abandoned reclamation (to force reclaim if it takes too long) size_t used; // count of pages in use - uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` + uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT` - size_t segment_info_slices; // initial slices we are using segment info and possible guard pages. + size_t segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages. // layout like this to optimize access in `mi_free` mi_segment_kind_t kind; size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment - mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one more for huge blocks with large alignment + mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one extra final entry for huge blocks with large alignment } mi_segment_t; @@ -499,11 +528,9 @@ typedef struct mi_padding_s { // A heap owns a set of pages. struct mi_heap_s { mi_tld_t* tld; - mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
- mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") _Atomic(mi_block_t*) thread_delayed_free; mi_threadid_t thread_id; // thread this heap belongs too - mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) + mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -512,6 +539,9 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages + uint8_t tag; // custom tag, can be used for separating heaps based on the object types + mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
+ mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; @@ -600,6 +630,9 @@ typedef struct mi_stats_s { mi_stat_counter_t normal_count; mi_stat_counter_t huge_count; mi_stat_counter_t large_count; + mi_stat_counter_t arena_count; + mi_stat_counter_t arena_crossover_count; + mi_stat_counter_t arena_rollback_count; #if MI_STAT>1 mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; #endif @@ -624,6 +657,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ @@ -652,6 +686,7 @@ typedef struct mi_segments_tld_s { size_t peak_count; // peak number of segments size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments + size_t reclaim_count;// number of reclaimed (abandoned) segments mi_stats_t* stats; // points to tld stats mi_os_tld_t* os; // points to os stats } mi_segments_tld_t; diff --git a/third-party/mimalloc/readme.md b/third-party/mimalloc/readme.md index ecab8131..a0296b43 100644 --- a/third-party/mimalloc/readme.md +++ b/third-party/mimalloc/readme.md @@ -12,8 +12,8 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release tag: `v2.1.2` (2023-04-24). -Latest stable tag: `v1.8.2` (2023-04-24). +Latest release tag: `v2.1.7` (2024-05-21). +Latest v1 tag: `v1.8.7` (2024-05-21). 
mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: @@ -29,6 +29,8 @@ It also includes a robust way to override the default allocator in [Windows](#ov bounded worst-case times with reference counting). Partly due to its simplicity, mimalloc has been ported to many systems (Windows, macOS, Linux, WASM, various BSD's, Haiku, MUSL, etc) and has excellent support for dynamic overriding. + At the same time, it is an industrial strength allocator that runs (very) large scale + distributed services on thousands of machines with excellent worst case latencies. - __free list sharding__: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality -- @@ -70,14 +72,29 @@ Enjoy! * `master`: latest stable release (based on `dev-slice`). * `dev`: development branch for mimalloc v1. Use this branch for submitting PR's. -* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev`. +* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev` (and is essentially equal to `dev` except for +`src/segment.c`) ### Releases -Note: the `v2.x` version has a new algorithm for managing internal mimalloc pages that tends to use reduce memory usage +Note: the `v2.x` version has a different algorithm for managing internal mimalloc pages (as slices) that tends to use reduce +memory usage and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance (see [below](#performance)); please report if you observe any significant performance regression. +* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches + from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). 
Upstream `vcpkg` patches. +* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation. +* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds. + Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size + directly available (and new `block_size_shift` to improve aligned block free-ing). + New approach to collection of abandoned segments: When + a thread terminates the segments it owns are abandoned (containing still live objects) and these can be + reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's + which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in + an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim` + gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). + * 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. @@ -89,7 +106,7 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. * 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. - Support abitrary large alignments (in particular for `std::pmr` pools). + Support arbitrary large alignments (in particular for `std::pmr` pools). 
Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). Various small bug fixes. @@ -142,7 +159,7 @@ mimalloc is used in various large scale low-latency services and programs, for e ## Windows -Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build. +Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build. The `mimalloc` project builds a static library (in `out/msvc-x64`), while the `mimalloc-override` project builds a DLL for overriding malloc in the entire program. @@ -222,7 +239,7 @@ target_link_libraries(myapp PUBLIC mimalloc-static) to link with the static library. See `test\CMakeLists.txt` for an example. For best performance in C++ programs, it is also recommended to override the -global `new` and `delete` operators. For convience, mimalloc provides +global `new` and `delete` operators. For convenience, mimalloc provides [`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` interface. @@ -278,17 +295,23 @@ You can set further options either programmatically (using [`mi_option_set`](htt Advanced options: +- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc + allocates segments and pages. Set this to 2 (default) to + only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems + as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). + Note that eager commit only increases the commit but not the actual the peak resident set + (rss) so it is generally ok to enable this. 
- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. - Setting it to `-1` disables purging completely. -- `MIMALLOC_ARENA_EAGER_COMMIT=1`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc - allocates segments and pages. This is by default - only enabled on overcommit systems (e.g. Linux) but enabling it explicitly on other systems (like Windows or macOS) - may improve performance. Note that eager commit only increases the commit but not the actual the peak resident set - (rss) so it is generally ok to enable this. + Setting it to `-1` disables purging completely. +- `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, + `MADV_DONTNEED` (which decreases rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused + memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems). + Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual + address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). Further options for large workloads and services: @@ -296,9 +319,10 @@ Further options for large workloads and services: at runtime. Setting `N` to 1 may avoid problems in some virtual environments. 
Also, setting it to a lower number than the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). -- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly - improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs - to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes +- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly + improve performance. When this option is disabled, it also disables transparent huge pages (THP) for the process + (on Linux and Android). Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs + to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). - `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at @@ -307,11 +331,12 @@ Further options for large workloads and services: OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). - Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). + Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). 
With huge OS pages, it may be beneficial to set the setting `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB) of a thread to not allocate in the huge OS pages; this prevents threads that are short lived - and allocate just a little to take up space in the huge OS page area (which cannot be purged). + and allocate just a little to take up space in the huge OS page area (which cannot be purged as huge OS pages are pinned + to physical memory). The huge pages are usually allocated evenly among NUMA nodes. We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all the huge pages at a specific numa node instead. @@ -392,32 +417,41 @@ the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-i ### Dynamic Override on Windows -Overriding on Windows is robust and has the -particular advantage to be able to redirect all malloc/free calls that go through +Dynamically overriding on mimalloc on Windows +is robust and has the particular advantage to be able to redirect all malloc/free calls that go through the (dynamic) C runtime allocator, including those from other DLL's or libraries. - -The overriding on Windows requires that you link your program explicitly with -the mimalloc DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). -Also, the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be put -in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency). -The redirection DLL ensures that all calls to the C runtime malloc API get redirected to -mimalloc (in `mimalloc-override.dll`). - -To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some -call to the mimalloc API in the `main` function, like `mi_version()` -(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project -for an example on how to use this. 
For best performance on Windows with C++, it +As it intercepts all allocation calls on a low level, it can be used reliably +on large programs that include other 3rd party components. +There are four requirements to make the overriding work robustly: + +1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). +2. Link your program explicitly with `mimalloc-override.dll` library. + To ensure the `mimalloc-override.dll` is loaded at run-time it is easiest to insert some + call to the mimalloc API in the `main` function, like `mi_version()` + (or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project + for an example on how to use this. +3. The [`mimalloc-redirect.dll`](bin) (or `mimalloc-redirect32.dll`) must be put + in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency of that DLL). + The redirection DLL ensures that all calls to the C runtime malloc API get redirected to + mimalloc functions (which reside in `mimalloc-override.dll`). +4. Ensure the `mimalloc-override.dll` comes as early as possible in the import + list of the final executable (so it can intercept all potential allocations). + +For best performance on Windows with C++, it is also recommended to also override the `new`/`delete` operations (by including -[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) a single(!) source file in your project). +[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) +a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. 
-(Note: in principle, it is possible to even patch existing executables without any recompilation +We cannot always re-link an executable with `mimalloc-override.dll`, and similarly, we cannot always +ensure that the DLL comes first in the import table of the final executable. +In many cases though we can patch existing executables without any recompilation if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll` into the import table (and put `mimalloc-redirect.dll` in the same folder) -Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)). - +Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388) or +the [`minject`](bin) program. ## Static override @@ -439,7 +473,7 @@ This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimallo under your control or otherwise mixing of pointers from different heaps may occur! -## Tools +# Tools Generally, we recommend using the standard allocator with memory tracking tools, but mimalloc can also be build to support the [address sanitizer][asan] or the excellent [Valgrind] tool. @@ -447,7 +481,7 @@ Moreover, it can be build to support Windows event tracing ([ETW]). This has a small performance overhead but does allow detecting memory leaks and byte-precise buffer overflows directly on final executables. See also the `test/test-wrong.c` file to test with various tools. -### Valgrind +## Valgrind To build with [valgrind] support, use the `MI_TRACK_VALGRIND=ON` cmake option: @@ -481,7 +515,7 @@ Valgrind support is in its initial development -- please report any issues. 
[Valgrind]: https://valgrind.org/ [valgrind-soname]: https://valgrind.org/docs/manual/manual-core.html#opt.soname-synonyms -### ASAN +## ASAN To build with the address sanitizer, use the `-DMI_TRACK_ASAN=ON` cmake option: @@ -510,7 +544,7 @@ Adress sanitizer support is in its initial development -- please report any issu [asan]: https://github.com/google/sanitizers/wiki/AddressSanitizer -### ETW +## ETW Event tracing for Windows ([ETW]) provides a high performance way to capture all allocations though mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option. diff --git a/third-party/mimalloc/src/alloc-aligned.c b/third-party/mimalloc/src/alloc-aligned.c index 1cd809f1..ba629ef3 100644 --- a/third-party/mimalloc/src/alloc-aligned.c +++ b/third-party/mimalloc/src/alloc-aligned.c @@ -15,25 +15,24 @@ terms of the MIT license. A copy of the license can be found in the file // Aligned Allocation // ------------------------------------------------------ -// Fallback primitive aligned allocation -- split out for better codegen -static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { + // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). 
+ mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); + if (alignment > size) return false; + if (alignment <= MI_MAX_ALIGN_SIZE) return true; + const size_t bsize = mi_good_size(size); + return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); +} + +// Fallback aligned allocation that over-allocates -- split out for better codegen +static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { - mi_assert_internal(size <= PTRDIFF_MAX); + mi_assert_internal(size <= (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)); mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); - const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)` - const size_t padsize = size + MI_PADDING_SIZE; - - // use regular allocation if it is guaranteed to fit the alignment constraints - if (offset==0 && alignment<=padsize && padsize<=MI_MAX_ALIGN_GUARANTEE && (padsize&align_mask)==0) { - void* p = _mi_heap_malloc_zero(heap, size, zero); - mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); - return p; - } - void* p; size_t oversize; - if mi_unlikely(alignment > MI_ALIGNMENT_MAX) { + if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) @@ -47,7 +46,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* oversize = (size <= MI_SMALL_SIZE_MAX ? 
MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block // zero afterwards as only the area from the aligned_p may be committed! - if (p == NULL) return NULL; + if (p == NULL) return NULL; } else { // otherwise over-allocate @@ -57,6 +56,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* } // .. and align within the allocation + const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)` const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask; const uintptr_t adjust = (poffset == 0 ? 0 : alignment - poffset); mi_assert_internal(adjust < alignment); @@ -69,14 +69,14 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* // todo: expand padding if overallocated ? mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); - mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p)); + mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_page(aligned_p), aligned_p)); mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); - + // now zero the block if needed - if (alignment > MI_ALIGNMENT_MAX) { - // for the tracker, on huge aligned allocations only from the start of the large block is defined + if (alignment > MI_BLOCK_ALIGNMENT_MAX) { + // for the tracker, on huge aligned allocations only the memory from the start of the large block is defined mi_track_mem_undefined(aligned_p, size); if (zero) { _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); @@ -85,48 +85,77 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* if (p != aligned_p) { 
mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); - } + } return aligned_p; } -// Primitive aligned allocation -static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +// Generic primitive aligned allocation -- split out for better codegen +static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { - // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size. - if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see ) + mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); + // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) + if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { #if MI_DEBUG > 0 - _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment); + _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } + + // use regular allocation if it is guaranteed to fit the alignment constraints. + // this is important to try as the fast path in `mi_heap_malloc_zero_aligned` only works when there exist + // a page with the right block size, and if we always use the over-alloc fallback that would never happen. 
+ if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { + void* p = _mi_heap_malloc_zero(heap, size, zero); + mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); + const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; + if mi_likely(is_aligned_or_null) { + return p; + } + else { + // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. + mi_assert(false); + mi_free(p); + } + } + + // fall back to over-allocation + return mi_heap_malloc_zero_aligned_at_overalloc(heap,size,alignment,offset,zero); +} - if mi_unlikely(size > PTRDIFF_MAX) { // we don't allocate more than PTRDIFF_MAX (see ) +// Primitive aligned allocation +static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +{ + // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size. 
+ if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see ) #if MI_DEBUG > 0 - _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); + _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } - const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` - const size_t padsize = size + MI_PADDING_SIZE; // note: cannot overflow due to earlier size > PTRDIFF_MAX check - + // try first if there happens to be a small block available with just the right alignment - if mi_likely(padsize <= MI_SMALL_SIZE_MAX && alignment <= padsize) { + if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { + const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` + const size_t padsize = size + MI_PADDING_SIZE; mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); - const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0; - if mi_likely(page->free != NULL && is_aligned) - { - #if MI_STAT>1 - mi_heap_stat_increase(heap, malloc, size); - #endif - void* p = _mi_page_malloc(heap, page, padsize, zero); // TODO: inline _mi_page_malloc - mi_assert_internal(p != NULL); - mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); - mi_track_malloc(p,size,zero); - return p; + if mi_likely(page->free != NULL) { + const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; + if mi_likely(is_aligned) + { + #if MI_STAT>1 + mi_heap_stat_increase(heap, malloc, size); + #endif + void* p = (zero ? 
_mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen + mi_assert_internal(p != NULL); + mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); + mi_track_malloc(p,size,zero); + return p; + } } } - // fallback - return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero); + + // fallback to generic aligned allocation + return mi_heap_malloc_zero_aligned_at_generic(heap, size, alignment, offset, zero); } @@ -139,27 +168,12 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* he } mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { - if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) return NULL; - #if !MI_PADDING - // without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`) - if mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX) - #else - // with padding, we can only guarantee this for fixed alignments - if mi_likely((alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2))) - && size <= MI_SMALL_SIZE_MAX) - #endif - { - // fast path for common alignment and size - return mi_heap_malloc_small(heap, size); - } - else { - return mi_heap_malloc_aligned_at(heap, size, alignment, 0); - } + return mi_heap_malloc_aligned_at(heap, size, alignment, 0); } // ensure a definition is emitted #if defined(__cplusplus) -static void* _mi_heap_malloc_aligned = (void*)&mi_heap_malloc_aligned; +void* _mi_extern_heap_malloc_aligned = (void*)&mi_heap_malloc_aligned; #endif // ------------------------------------------------------ diff --git a/third-party/mimalloc/src/alloc-override.c b/third-party/mimalloc/src/alloc-override.c index 873065dc..12837cdd 100644 --- a/third-party/mimalloc/src/alloc-override.c +++ 
b/third-party/mimalloc/src/alloc-override.c @@ -23,7 +23,7 @@ mi_decl_externc size_t malloc_good_size(size_t size); #endif // helper definition for C override of C++ new -typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; +typedef void* mi_nothrow_t; // ------------------------------------------------------ // Override system malloc @@ -77,7 +77,9 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), MI_INTERPOSE_MI(strdup), + #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7 MI_INTERPOSE_MI(strndup), + #endif MI_INTERPOSE_MI(realpath), MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), @@ -128,11 +130,19 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; // cannot override malloc unless using a dll. // we just override new/delete which does work in a static library. #else - // On all other systems forward to our API + // On all other systems forward allocation primitives to our API mi_decl_export void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) mi_decl_export void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) - mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) + mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) + // In principle we do not need to forward `strdup`/`strndup` but on some systems these do not use `malloc` internally (but a more primitive call) + // We only override if `strdup` is not a macro (as on some older libc's, see issue #885) + #if !defined(strdup) + mi_decl_export char* strdup(const char* str) MI_FORWARD1(mi_strdup, str) + #endif + #if !defined(strndup) && (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)) + mi_decl_export char* strndup(const char* str, size_t n) MI_FORWARD2(mi_strndup, str, n) + #endif #endif #if (defined(__GNUC__) || 
defined(__clang__)) && !defined(__APPLE__) @@ -192,11 +202,17 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[] void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) + void _ZdlPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } + void _ZdlPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete(void*, std::nothrow_t const&) + void _ZdaPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete[](void*, std::nothrow_t const&) + void _ZdlPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete(void*, std::align_val_t, std::nothrow_t const&) + void _ZdaPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete[](void*, std::align_val_t, std::nothrow_t const&) + #if (MI_INTPTR_SIZE==8) void* _Znwm(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit void* _Znam(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit @@ -259,10 +275,11 @@ extern "C" { // no forwarding here due to aliasing/name mangling issues void cfree(void* p) { mi_free(p); } void* pvalloc(size_t size) { return mi_pvalloc(size); } -void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } -int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); } void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, 
size); } +void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } +// some systems define reallocarr so mark it as a weak symbol (#751) +mi_decl_weak int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); } #if defined(__wasi__) // forward __libc interface (see PR #667) diff --git a/third-party/mimalloc/src/alloc.c b/third-party/mimalloc/src/alloc.c index ffc1747d..86aaae75 100644 --- a/third-party/mimalloc/src/alloc.c +++ b/third-party/mimalloc/src/alloc.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2022, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -18,6 +18,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_IN_ALLOC_C #include "alloc-override.c" +#include "free.c" #undef MI_IN_ALLOC_C // ------------------------------------------------------ @@ -26,16 +27,18 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { - mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); +// Note: in release mode the (inlined) routine is about 7 instructions with a single test. 
+extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +{ + mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { return _mi_malloc_generic(heap, size, zero, 0); } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list - page->used++; page->free = mi_block_next(page, block); + page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); #if MI_DEBUG>3 if (page->free_is_zero) { @@ -50,58 +53,66 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz // zero the block? note: we need to zero the full block size (issue #63) if mi_unlikely(zero) { - mi_assert_internal(page->xblock_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) - mi_assert_internal(page->xblock_size >= MI_PADDING_SIZE); + mi_assert_internal(page->block_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) + mi_assert_internal(page->block_size >= MI_PADDING_SIZE); if (page->free_is_zero) { block->next = 0; - mi_track_mem_defined(block, page->xblock_size - MI_PADDING_SIZE); + mi_track_mem_defined(block, page->block_size - MI_PADDING_SIZE); } else { - _mi_memzero_aligned(block, page->xblock_size - MI_PADDING_SIZE); - } + _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); + } } -#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN if (!zero && !mi_page_is_huge(page)) { memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); } -#elif (MI_SECURE!=0) + #elif (MI_SECURE!=0) if (!zero) { block->next = 0; } // don't leak internal data -#endif + #endif -#if (MI_STAT>0) + #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, normal, bsize); 
mi_heap_stat_counter_increase(heap, normal_count, 1); -#if (MI_STAT>1) + #if (MI_STAT>1) const size_t bin = _mi_bin(bsize); mi_heap_stat_increase(heap, normal_bins[bin], 1); -#endif + #endif } -#endif - -#if MI_PADDING // && !MI_TRACK_ENABLED - mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); - ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); - #if (MI_DEBUG>=2) - mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); #endif - mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess - padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); - padding->delta = (uint32_t)(delta); - #if MI_PADDING_CHECK - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)padding - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes - for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } - } + + #if MI_PADDING // && !MI_TRACK_ENABLED + mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); + ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); + #if (MI_DEBUG>=2) + mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); + #endif + mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess + padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); + padding->delta = (uint32_t)(delta); + #if MI_PADDING_CHECK + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)padding - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes + for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } + } + #endif #endif -#endif return block; } +// extra entries for improved efficiency in `alloc-aligned.c`. +extern void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { + return _mi_page_malloc_zero(heap,page,size,false); +} +extern void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { + return _mi_page_malloc_zero(heap,page,size,true); +} + static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { mi_assert(heap != NULL); #if MI_DEBUG @@ -112,9 +123,11 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, #if (MI_PADDING) if (size == 0) { size = sizeof(void*); } #endif + mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); - void* const p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero); + void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); + #if MI_STAT>1 if (p != NULL) { if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } @@ -190,484 +203,6 @@ mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept } -// ------------------------------------------------------ -// Check for double free in secure and debug mode -// This is somewhat expensive so only enabled for secure mode 4 -// ------------------------------------------------------ - -#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) -// linear check if the free list contains a specific element -static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { - while (list != NULL) { - if (elem==list) return true; - list = mi_block_next(page, list); - } - return false; -} - -static mi_decl_noinline bool 
mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { - // The decoded value is in the same page (or NULL). - // Walk the free lists to verify positively if it is already freed - if (mi_list_contains(page, page->free, block) || - mi_list_contains(page, page->local_free, block) || - mi_list_contains(page, mi_page_thread_free(page), block)) - { - _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); - return true; - } - return false; -} - -#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } - -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - bool is_double_free = false; - mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field - if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? - (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? - { - // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? 
- // (continue in separate function to improve code generation) - is_double_free = mi_check_is_double_freex(page, block); - } - return is_double_free; -} -#else -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); - return false; -} -#endif - -// --------------------------------------------------------------------------- -// Check for heap block overflow by setting up padding at the end of the block -// --------------------------------------------------------------------------- - -#if MI_PADDING // && !MI_TRACK_ENABLED -static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { - *bsize = mi_page_usable_block_size(page); - const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - *delta = padding->delta; - uint32_t canary = padding->canary; - uintptr_t keys[2]; - keys[0] = page->keys[0]; - keys[1] = page->keys[1]; - bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); - return ok; -} - -// Return the exact usable size of a block. -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? bsize - delta : 0); -} - -// When a non-thread-local block is freed, it becomes part of the thread delayed free -// list that is freed later by the owning heap. If the exact usable size is too small to -// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) -// so it will later not trigger an overflow error in `mi_free_block`. 
-void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); - if (!ok || (bsize - delta) >= min_size) return; // usually already enough space - mi_assert_internal(bsize >= min_size); - if (bsize < min_size) return; // should never happen - size_t new_delta = (bsize - min_size); - mi_assert_internal(new_delta < bsize); - mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - padding->delta = (uint32_t)new_delta; - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); -} -#else -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(block); - return mi_page_usable_block_size(page); -} - -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - MI_UNUSED(page); - MI_UNUSED(block); - MI_UNUSED(min_size); -} -#endif - -#if MI_PADDING && MI_PADDING_CHECK - -static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - *size = *wrong = bsize; - if (!ok) return false; - mi_assert_internal(bsize >= delta); - *size = bsize - delta; - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)block + bsize - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes - mi_track_mem_defined(fill, maxpad); - for (size_t i = 0; i < maxpad; i++) { - if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - delta + i; - ok = false; - break; - } - } - mi_track_mem_noaccess(fill, maxpad); - } - return ok; -} - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - size_t size; - size_t wrong; - if (!mi_verify_padding(page,block,&size,&wrong)) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); - } -} - -#else - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); -} - -#endif - -// only maintain stats for smaller objects if requested -#if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - #if (MI_STAT < 2) - MI_UNUSED(block); - #endif - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_usable_block_size(page); - #if (MI_STAT>1) - const size_t usize = mi_page_usable_size_of(page, block); - mi_heap_stat_decrease(heap, malloc, usize); - #endif - if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize); - #if (MI_STAT > 1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); - #endif - } - else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, large, bsize); - } - else { - mi_heap_stat_decrease(heap, huge, bsize); - } -} -#else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); MI_UNUSED(block); -} -#endif - -#if MI_HUGE_PAGE_ABANDON -#if (MI_STAT>0) -// maintain stats for huge objects -static void mi_stat_huge_free(const mi_page_t* page) { - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - 
mi_heap_stat_decrease(heap, large, bsize); - } - else { - mi_heap_stat_decrease(heap, huge, bsize); - } -} -#else -static void mi_stat_huge_free(const mi_page_t* page) { - MI_UNUSED(page); -} -#endif -#endif - -// ------------------------------------------------------ -// Free -// ------------------------------------------------------ - -// multi-threaded free (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - _mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - - // huge page segments are always abandoned and can be freed immediately - mi_segment_t* segment = _mi_page_segment(page); - if (segment->kind == MI_SEGMENT_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - mi_stat_huge_free(page); - _mi_segment_huge_page_free(segment, page, block); - return; - #else - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). - _mi_segment_huge_page_reset(segment, page, block); - #endif - } - - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - if (segment->kind != MI_SEGMENT_HUGE) { // not for huge segments as we just reset the content - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - } - #endif - - // Try to put the block on either the page-local thread free list, or the heap delayed free list. 
- mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// regular free -static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block) -{ - // and push it on the free list - //const size_t bsize = mi_page_block_size(page); - if mi_likely(local) { - // owning thread can free a block directly - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - #if (MI_DEBUG>0) 
&& !MI_TRACK_ENABLED && !MI_TSAN - if (!mi_page_is_huge(page)) { // huge page content may be already decommitted - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - } - #endif - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - page->used--; - if mi_unlikely(mi_page_all_free(page)) { - _mi_page_retire(page); - } - else if mi_unlikely(mi_page_is_in_full(page)) { - _mi_page_unfull(page); - } - } - else { - _mi_free_block_mt(page,block); - } -} - - -// Adjust a block that was allocated aligned, to the actual start of the block in the page. -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { - mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - const size_t adjust = (diff % mi_page_block_size(page)); - return (mi_block_t*)((uintptr_t)p - adjust); -} - - -void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - _mi_free_block(page, is_local, block); -} - -// Get the segment data belonging to a pointer -// This is just a single `and` in assembly but does further checks in debug mode -// (and secure mode) if this was a valid pointer. 
-static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) -{ - MI_UNUSED(msg); - mi_assert(p != NULL); - -#if (MI_DEBUG>0) - if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { - _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); - return NULL; - } -#endif - - mi_segment_t* const segment = _mi_ptr_segment(p); - mi_assert_internal(segment != NULL); - -#if (MI_DEBUG>0) - if mi_unlikely(!mi_is_in_heap_region(p)) { - #if (MI_INTPTR_SIZE == 8 && defined(__linux__)) - if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640) - #else - { - #endif - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } - } - } -#endif -#if (MI_DEBUG>0 || MI_SECURE>=4) - if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { - _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); - return NULL; - } -#endif - - return segment; -} - -// Free a block -// fast path written carefully to prevent spilling on the stack -void mi_free(void* p) mi_attr_noexcept -{ - if mi_unlikely(p == NULL) return; - mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); - - if mi_likely(is_local) { // thread-local free? 
- if mi_likely(page->flags.full_aligned == 0) // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - { - mi_block_t* const block = (mi_block_t*)p; - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - mi_stat_free(page, block); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) - _mi_page_retire(page); - } - } - else { - // page is full or contains (inner) aligned blocks; use generic path - _mi_free_generic(segment, page, true, p); - } - } - else { - // not thread-local; use generic path - _mi_free_generic(segment, page, false, p); - } -} - -// return true if successful -bool _mi_free_delayed_block(mi_block_t* block) { - // get segment and page - const mi_segment_t* const segment = _mi_ptr_segment(block); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* const page = _mi_segment_page_of(segment, block); - - // Clear the no-delayed flag so delayed freeing is used again for this page. - // This must be done before collecting the free lists on this page -- otherwise - // some blocks may end up in the page `thread_free` list with no blocks in the - // heap `thread_delayed_free` list which may cause the page to be never freed! 
- // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) - if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { - return false; - } - - // collect all other non-local frees to ensure up-to-date `used` count - _mi_page_free_collect(page, false); - - // and free the block (possibly freeing the page as well since used is updated) - _mi_free_block(page, true, block); - return true; -} - -// Bytes available in a block -mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { - const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); - const size_t size = mi_page_usable_size_of(page, block); - const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; - mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); - return (size - adjust); -} - -static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - if (p == NULL) return 0; - const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); - const mi_page_t* const page = _mi_segment_page_of(segment, p); - if mi_likely(!mi_page_has_aligned(page)) { - const mi_block_t* block = (const mi_block_t*)p; - return mi_page_usable_size_of(page, block); - } - else { - // split out to separate routine for improved code generation - return mi_page_usable_aligned_size_of(segment, page, p); - } -} - -mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { - return _mi_usable_size(p, "mi_usable_size"); -} - - -// ------------------------------------------------------ -// Allocation extensions -// ------------------------------------------------------ - -void mi_free_size(void* p, size_t size) mi_attr_noexcept { - MI_UNUSED_RELEASE(size); - mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); - mi_free(p); -} - -void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { 
- MI_UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free_size(p,size); -} - -void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { - MI_UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free(p); -} - mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count,size,&total)) return NULL; @@ -794,11 +329,11 @@ mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_ // `strdup` using mi_malloc mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { if (s == NULL) return NULL; - size_t n = strlen(s); - char* t = (char*)mi_heap_malloc(heap,n+1); + size_t len = _mi_strlen(s); + char* t = (char*)mi_heap_malloc(heap,len+1); if (t == NULL) return NULL; - _mi_memcpy(t, s, n); - t[n] = 0; + _mi_memcpy(t, s, len); + t[len] = 0; return t; } @@ -809,13 +344,11 @@ mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexce // `strndup` using mi_malloc mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { if (s == NULL) return NULL; - const char* end = (const char*)memchr(s, 0, n); // find end of string in the first `n` characters (returns NULL if not found) - const size_t m = (end != NULL ? 
(size_t)(end - s) : n); // `m` is the minimum of `n` or the end-of-string - mi_assert_internal(m <= n); - char* t = (char*)mi_heap_malloc(heap, m+1); + const size_t len = _mi_strnlen(s,n); // len <= n + char* t = (char*)mi_heap_malloc(heap, len+1); if (t == NULL) return NULL; - _mi_memcpy(t, s, m); - t[m] = 0; + _mi_memcpy(t, s, len); + t[len] = 0; return t; } @@ -869,7 +402,8 @@ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) char* rname = realpath(fname, NULL); if (rname == NULL) return NULL; char* result = mi_heap_strdup(heap, rname); - free(rname); // use regular free! (which may be redirected to our free but that's ok) + mi_cfree(rname); // use checked free (which may be redirected to our free but that's ok) + // note: with ASAN realpath is intercepted and mi_cfree may leak the returned pointer :-( return result; } /* @@ -913,9 +447,13 @@ static bool mi_try_new_handler(bool nothrow) { #endif if (h==NULL) { _mi_error_message(ENOMEM, "out of memory in 'new'"); + #if defined(_CPPUNWIND) || defined(__cpp_exceptions) // exceptions are not always enabled if (!nothrow) { throw std::bad_alloc(); } + #else + MI_UNUSED(nothrow); + #endif return false; } else { diff --git a/third-party/mimalloc/src/arena.c b/third-party/mimalloc/src/arena.c index a04a04c8..648ee844 100644 --- a/third-party/mimalloc/src/arena.c +++ b/third-party/mimalloc/src/arena.c @@ -13,7 +13,7 @@ threads and need to be accessed using atomic operations. Arenas are used to for huge OS page (1GiB) reservations or for reserving OS memory upfront which can be improve performance or is sometimes needed -on embedded devices. We can also employ this with WASI or `sbrk` systems +on embedded devices. We can also employ this with WASI or `sbrk` systems to reserve large arenas upfront and be able to reuse the memory more effectively. The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. 
@@ -48,14 +48,16 @@ typedef struct mi_arena_s { size_t meta_size; // size of the arena structure itself (including its bitmaps) mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena + bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(size_t) search_idx; // optimization to start the search for free blocks _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) + // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. 
} mi_arena_t; @@ -94,7 +96,7 @@ bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_i return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); } else { - return mi_arena_id_is_suitable(0, false, request_arena_id); + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); } } @@ -103,7 +105,7 @@ bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) { } /* ----------------------------------------------------------- - Arena allocations get a (currently) 16-bit memory id where the + Arena allocations get a (currently) 16-bit memory id where the lower 8 bits are the arena id, and the upper bits the block index. ----------------------------------------------------------- */ @@ -143,18 +145,19 @@ static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bit #define MI_ARENA_STATIC_MAX (MI_INTPTR_SIZE*MI_KiB) // 8 KiB on 64-bit -static uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; -static _Atomic(size_t) mi_arena_static_top; +static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 +static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { *memid = _mi_memid_none(); if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; - if ((mi_atomic_load_relaxed(&mi_arena_static_top) + size) > MI_ARENA_STATIC_MAX) return NULL; + const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); + if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; // try to claim space - if (alignment == 0) { alignment = 1; } + if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } const size_t oversize = size + alignment - 1; - if (oversize > MI_ARENA_STATIC_MAX) return NULL; + if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); size_t top 
= oldtop + oversize; if (top > MI_ARENA_STATIC_MAX) { @@ -165,9 +168,10 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m // success *memid = _mi_memid_create(MI_MEM_STATIC); + memid->initially_zero = true; const size_t start = _mi_align_up(oldtop, alignment); uint8_t* const p = &mi_arena_static[start]; - _mi_memzero(p, size); + _mi_memzero_aligned(p, size); return p; } @@ -175,11 +179,19 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st *memid = _mi_memid_none(); // try static - void* p = mi_arena_static_zalloc(size, MI_ALIGNMENT_MAX, memid); + void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); if (p != NULL) return p; // or fall back to the OS - return _mi_os_alloc(size, memid, stats); + p = _mi_os_alloc(size, memid, stats); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; } static void mi_arena_meta_free(void* p, mi_memid_t memid, size_t size, mi_stats_t* stats) { @@ -201,11 +213,11 @@ static void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { ----------------------------------------------------------- */ // claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) { - mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around + if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, 
bitmap_idx, stats)) { + mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around return true; }; return false; @@ -223,9 +235,9 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL; + if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; - // claimed it! + // claimed it! void* p = mi_arena_block_start(arena, bitmap_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); memid->is_pinned = arena->memid.is_pinned; @@ -265,21 +277,21 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar // no need to commit, but check if already fully committed memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); } - + return p; } // allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) { MI_UNUSED_RELEASE(alignment); mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t bcount = mi_block_count_of_size(size); + const size_t bcount = mi_block_count_of_size(size); const size_t arena_index = mi_arena_id_index(arena_id); mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); mi_assert_internal(size <= mi_arena_block_size(bcount)); - + // Check arena suitability mi_arena_t* arena = 
mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; @@ -299,7 +311,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no // allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) { @@ -307,9 +319,9 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if mi_likely(max_arena == 0) return NULL; - + if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested + // try a specific arena if requested if (mi_arena_id_index(req_arena_id) < max_arena) { void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -317,7 +329,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz } else { // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { + for (size_t i = 0; i < max_arena; i++) { void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } @@ -345,22 +357,22 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); if (arena_reserve == 0) return false; - if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for some embedded systems for example) + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if 
virtual reserve is not supported (for WASM for example) } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); if (arena_count >= 8 && arena_count <= 128) { arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve; // scale up the arena sizes exponentially - } + } if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - + // commit eagerly? bool arena_commit = false; if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive */, arena_id) == 0); -} + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, @@ -373,35 +385,37 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset const int numa_node = _mi_os_numa_node(tld); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || 
req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? + if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } } } } // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_limit_os_alloc) || req_arena_id != _mi_arena_id_none()) { + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { errno = ENOMEM; return NULL; } - + // finally, fall back to the OS if (align_offset > 0) { return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); } else { return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } + } } void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) @@ -437,22 +451,22 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_assert_internal(arena->blocks_purge != NULL); mi_assert_internal(!arena->memid.is_pinned); const size_t size = mi_arena_block_size(blocks); - void* const p = mi_arena_block_start(arena, bitmap_idx); + void* const p = mi_arena_block_start(arena, bitmap_idx); bool needs_recommit; if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { // all blocks are committed, we can purge freely needs_recommit = _mi_os_purge(p, 
size, stats); } else { - // some blocks are not committed -- this can happen when a partially committed block is freed + // some blocks are not committed -- this can happen when a partially committed block is freed // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), // and also undo the decommit stats (as it was already adjusted) mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); - _mi_stat_increase(&stats->committed, size); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } } - + // clear the purged blocks _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); // update committed bitmap @@ -470,13 +484,13 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t if (_mi_preloading() || delay == 0) { // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); + mi_arena_purge(arena, bitmap_idx, blocks, stats); } else { // schedule decommit mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, delay/10); // add smallish extra delay + mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay } else { mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); @@ -512,7 +526,7 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, } // returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) { if 
(arena->memid.is_pinned || arena->blocks_purge == NULL) return false; mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); @@ -520,11 +534,11 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (!force && expire > now) return false; // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0); - + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + // potential purges scheduled, walk through the bitmap bool any_purged = false; - bool full_purge = true; + bool full_purge = true; for (size_t i = 0; i < arena->field_count; i++) { size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); if (purge != 0) { @@ -575,7 +589,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) // allow only one thread to purge at a time static mi_atomic_guard_t purge_guard; - mi_atomic_guard(&purge_guard) + mi_atomic_guard(&purge_guard) { mi_msecs_t now = _mi_clock_now(); size_t max_purge_count = (visit_all ? 
max_arena : 1); @@ -588,7 +602,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) } } } - } + } } @@ -602,12 +616,12 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi if (p==NULL) return; if (size==0) return; const bool all_committed = (committed_size == size); - + if (mi_memkind_is_os(memid.memkind)) { // was a direct OS allocation, pass through if (!all_committed && committed_size > 0) { // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&stats->committed, committed_size); + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } _mi_os_free(p, size, memid, stats); } @@ -620,15 +634,15 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); const size_t blocks = mi_block_count_of_size(size); - + // checks if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } @@ -642,7 +656,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); - + if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when 
re-using) _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); @@ -650,16 +664,16 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&stats->committed, committed_size); + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } // note: if not all committed, it may be that the purge will reset/decommit the entire range // that contains already decommitted parts. Since purge consistently uses reset or decommit that // works (as we should never reset decommitted parts). } // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); + mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); } - + // and make it available to others again bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); if (!all_inuse) { @@ -684,9 +698,9 @@ static void mi_arenas_unsafe_destroy(void) { for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { - if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { + if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); + _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); } else { new_max_arena = i; @@ -701,15 +715,15 @@ static void mi_arenas_unsafe_destroy(void) { } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arena_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, true /* 
visit all */, stats); +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { mi_arenas_unsafe_destroy(); - _mi_arena_collect(true /* force purge */, stats); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas } // Is a pointer inside any of our arenas? @@ -717,19 +731,151 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { - return true; + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { + return true; } } return false; } +/* ----------------------------------------------------------- + Abandoned blocks/segments. + This is used to atomically abandon/reclaim segments + (and crosses the arena API but it is convenient to have here). + Abandoned segments still have live blocks; they get reclaimed + when a thread frees a block in it, or when a thread needs a fresh + segment; these threads scan the abandoned segments through + the arena bitmaps. +----------------------------------------------------------- */ + +// Maintain a count of all abandoned segments +static mi_decl_cache_align _Atomic(size_t)abandoned_count; + +size_t _mi_arena_segment_abandoned_count(void) { + return mi_atomic_load_relaxed(&abandoned_count); +} + +// reclaim a specific abandoned segment; `true` on success. +// sets the thread_id. 
+bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) +{ + if (segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena, consider it un-abandoned now. + // but we need to still claim it atomically -- we use the thread_id for that. + size_t expected = 0; + if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { + mi_atomic_decrement_relaxed(&abandoned_count); + return true; + } + else { + return false; + } + } + // arena segment: use the blocks_abandoned bitmap. + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); + if (was_marked) { + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + mi_atomic_decrement_relaxed(&abandoned_count); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); + } + // mi_assert_internal(was_marked); + mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return was_marked; +} + +// mark a specific segment as abandoned +// clears the thread_id. 
+void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) +{ + mi_atomic_store_release(&segment->thread_id, 0); + mi_assert_internal(segment->used == segment->abandoned); + if (segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena; count it as abandoned and return + mi_atomic_increment_relaxed(&abandoned_count); + return; + } + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + if (was_unmarked) { mi_atomic_increment_relaxed(&abandoned_count); } + mi_assert_internal(was_unmarked); + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); +} + +// start a cursor at a randomized arena +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + current->start = (max_arena == 0 ? 
0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); + current->count = 0; + current->bitmap_idx = 0; +} + +// reclaim abandoned segments +// this does not set the thread id (so it appears as still abandoned) +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) +{ + const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); + if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; + + int count = previous->count; + size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); + size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; + // visit arena's (from previous) + for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { + mi_arena_id_t arena_idx = previous->start + count; + if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + if (arena != NULL) { + // visit the abandoned fields (starting at previous_idx) + for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { + size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); + if mi_unlikely(field != 0) { // skip zero fields quickly + // visit each set bit in the field (todo: maybe use `ctz` here?) 
+ for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { + // pre-check if the bit is set + size_t mask = ((size_t)1 << bit_idx); + if mi_unlikely((field & mask) == mask) { + mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + // try to reclaim it atomically + if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { + mi_atomic_decrement_relaxed(&abandoned_count); + previous->bitmap_idx = bitmap_idx; + previous->count = count; + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return segment; + } + } + } + } + } + } + } + // no more found + previous->bitmap_idx = 0; + previous->count = 0; + return NULL; +} + /* ----------------------------------------------------------- Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id) { +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); @@ -740,6 +886,7 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id) { mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); if (arena_id != NULL) { *arena_id = arena->id; } @@ -757,13 +904,13 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int const size_t bcount = size / MI_ARENA_BLOCK_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (memid.is_pinned ? 2 : 4); + const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); mi_memid_t meta_memid; mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? if (arena == NULL) return false; - - // already zero'd due to os_alloc + + // already zero'd due to zalloc // _mi_memzero(arena, asize); arena->id = _mi_arena_id_none(); arena->memid = memid; @@ -777,14 +924,16 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap - arena->blocks_purge = (arena->memid.is_pinned ? 
NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap + // consequetive bitmaps + arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap + arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap + arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap + arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap // initialize committed bitmap? if (arena->blocks_committed != NULL && arena->memid.initially_committed) { memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning } - + // and claim leftover blocks if needed (so we never allocate there) ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; mi_assert_internal(post >= 0); @@ -793,7 +942,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); } - return mi_arena_add(arena, arena_id); + return mi_arena_add(arena, arena_id, &_mi_stats_main); } @@ -815,7 +964,7 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc const bool is_large = memid.is_pinned; // todo: use separate is_large field? if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size, 1024)); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? 
" (in large os pages)" : ""); @@ -838,32 +987,55 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noe Debugging ----------------------------------------------------------- */ -static size_t mi_debug_show_bitmap(const char* prefix, mi_bitmap_field_t* fields, size_t field_count ) { +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bcount = 0; size_t inuse_count = 0; for (size_t i = 0; i < field_count; i++) { char buf[MI_BITMAP_FIELD_BITS + 1]; uintptr_t field = mi_atomic_load_relaxed(&fields[i]); - for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++) { - bool inuse = ((((uintptr_t)1 << bit) & field) != 0); - if (inuse) inuse_count++; - buf[MI_BITMAP_FIELD_BITS - 1 - bit] = (inuse ? 'x' : '.'); + for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { + if (bcount < block_count) { + bool inuse = ((((uintptr_t)1 << bit) & field) != 0); + if (inuse) inuse_count++; + buf[bit] = (inuse ? 
'x' : '.'); + } + else { + buf[bit] = ' '; + } } buf[MI_BITMAP_FIELD_BITS] = 0; - _mi_verbose_message("%s%s\n", prefix, buf); + _mi_verbose_message("%s %s\n", prefix, buf); } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); return inuse_count; } -void mi_debug_show_arenas(void) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t inuse_total = 0; + size_t abandoned_total = 0; + size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; - size_t inuse_count = 0; - _mi_verbose_message("arena %zu: %zu blocks with %zu fields\n", i, arena->block_count, arena->field_count); - inuse_count += mi_debug_show_bitmap(" ", arena->blocks_inuse, arena->field_count); - _mi_verbose_message(" blocks in use ('x'): %zu\n", inuse_count); + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? 
", pinned" : "")); + if (show_inuse) { + inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); + } + if (arena->blocks_committed != NULL) { + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); + } + if (show_abandoned) { + abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); + } + if (show_purge && arena->blocks_purge != NULL) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + } } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); + if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); } diff --git a/third-party/mimalloc/src/bitmap.c b/third-party/mimalloc/src/bitmap.c index a13dbe15..4b6be66b 100644 --- a/third-party/mimalloc/src/bitmap.c +++ b/third-party/mimalloc/src/bitmap.c @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- Concurrent bitmap that can set/reset sequences of bits atomically, -represeted as an array of fields where each field is a machine word (`size_t`) +represented as an array of fields where each field is a machine word (`size_t`) There are two api's; the standard one cannot have sequences that cross between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). @@ -200,7 +200,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Try to atomically claim a sequence of `count` bits starting from the field // at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. 
// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx) +static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { mi_assert_internal(bitmap_idx != NULL); @@ -260,6 +260,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); // claimed! + mi_stat_counter_increase(stats->arena_crossover_count,1); *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); return true; @@ -279,9 +280,10 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit newmap = (map & ~initial_mask); } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); } + mi_stat_counter_increase(stats->arena_rollback_count,1); // retry? (we make a recursive call instead of goto to be able to use const declarations) if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx); + return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); } else { return false; @@ -291,7 +293,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. 
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { mi_assert_internal(count > 0); if (count <= 2) { // we don't bother with crossover fields for small counts @@ -303,13 +305,15 @@ bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitm for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { if (idx >= bitmap_fields) { idx = 0; } // wrap // first try to claim inside a field + /* if (count <= MI_BITMAP_FIELD_BITS) { if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { return true; } } + */ // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) { + if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { return true; } } diff --git a/third-party/mimalloc/src/bitmap.h b/third-party/mimalloc/src/bitmap.h index 0a765c71..d8316b83 100644 --- a/third-party/mimalloc/src/bitmap.h +++ b/third-party/mimalloc/src/bitmap.h @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- Concurrent bitmap that can set/reset sequences of bits atomically, -represeted as an array of fields where each field is a machine word (`size_t`) +represented as an array of fields where each field is a machine word (`size_t`) There are two api's; the standard one cannot have sequences that cross between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). 
@@ -99,7 +99,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. diff --git a/third-party/mimalloc/src/free.c b/third-party/mimalloc/src/free.c new file mode 100644 index 00000000..b9cb6346 --- /dev/null +++ b/third-party/mimalloc/src/free.c @@ -0,0 +1,530 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ +#if !defined(MI_IN_ALLOC_C) +#error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" +// add includes help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" // _mi_prim_thread_id() +#endif + +// forward declarations +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); +static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); + + +// ------------------------------------------------------ +// Free +// ------------------------------------------------------ + +// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block); + +// regular free of a (thread local) block pointer +// fast path written carefully to prevent spilling on the stack +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) +{ + // checks + if mi_unlikely(mi_check_is_double_free(page, block)) return; + mi_check_padding(page, block); + if (track_stats) { mi_stat_free(page, block); } + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + if (!mi_page_is_huge(page)) { // huge page content may be already decommitted + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + } + #endif + if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned + + // actual free: push on the local free list + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + if 
mi_unlikely(--page->used == 0) { + _mi_page_retire(page); + } + else if mi_unlikely(check_full && mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } +} + +// Adjust a block that was allocated aligned, to the actual start of the block in the page. +// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the +// `page_start` and `block_size` fields; however these are constant and the page won't be +// deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently. +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { + mi_assert_internal(page!=NULL && p!=NULL); + + size_t diff = (uint8_t*)p - page->page_start; + size_t adjust; + if mi_likely(page->block_size_shift != 0) { + adjust = diff & (((size_t)1 << page->block_size_shift) - 1); + } + else { + adjust = diff % mi_page_block_size(page); + } + + return (mi_block_t*)((uintptr_t)p - adjust); +} + +// free a local pointer (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { + MI_UNUSED(segment); + mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); + mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); +} + +// free a pointer owned by another thread (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { + mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_free_block_mt(page, segment, block); +} + +// generic free (for runtime integration) +void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(page,segment,p); + else mi_free_generic_mt(page,segment,p); +} + +// Get the segment data belonging to a pointer +// This is just a single `and` in release mode but does further checks in debug mode +// (and secure mode) to see if this was a valid pointer. +static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) +{ + MI_UNUSED(msg); + +#if (MI_DEBUG>0) + if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { + _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); + return NULL; + } +#endif + + mi_segment_t* const segment = _mi_ptr_segment(p); + if mi_unlikely(segment==NULL) return segment; + +#if (MI_DEBUG>0) + if mi_unlikely(!mi_is_in_heap_region(p)) { + #if (MI_INTPTR_SIZE == 8 && defined(__linux__)) + if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640) + #else + { + #endif + _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" + "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); + if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { + _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); + } + } + } +#endif +#if (MI_DEBUG>0 || MI_SECURE>=4) + if 
mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { + _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); + return NULL; + } +#endif + + return segment; +} + +// Free a block +// Fast path written carefully to prevent register spilling on the stack +void mi_free(void* p) mi_attr_noexcept +{ + mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); + if mi_unlikely(segment==NULL) return; + + const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_page_t* const page = _mi_segment_page_of(segment, p); + + if mi_likely(is_local) { // thread-local free? + if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + // thread-local, aligned, and not a full page + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); + } + else { + // page is full or contains (inner) aligned blocks; use generic path + mi_free_generic_local(page, segment, p); + } + } + else { + // not thread-local; use generic path + mi_free_generic_mt(page, segment, p); + } +} + +// return true if successful +bool _mi_free_delayed_block(mi_block_t* block) { + // get segment and page + mi_assert_internal(block!=NULL); + const mi_segment_t* const segment = _mi_ptr_segment(block); + mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(_mi_thread_id() == segment->thread_id); + mi_page_t* const page = _mi_segment_page_of(segment, block); + + // Clear the no-delayed flag so delayed freeing is used again for this page. + // This must be done before collecting the free lists on this page -- otherwise + // some blocks may end up in the page `thread_free` list with no blocks in the + // heap `thread_delayed_free` list which may cause the page to be never freed! 
+ // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) + if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { + return false; + } + + // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count + _mi_page_free_collect(page, false); + + // and free the block (possibly freeing the page as well since `used` is updated) + mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */); + return true; +} + +// ------------------------------------------------------ +// Multi-threaded Free (`_mt`) +// ------------------------------------------------------ + +// Push a block that is owned by another thread on its page-local thread free +// list or it's heap delayed free list. Such blocks are later collected by +// the owning thread in `_mi_free_delayed_block`. +static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +{ + // Try to put the block on either the page-local thread free list, + // or the heap delayed free list (if this is the first non-local free in that page) + mi_thread_free_t tfreex; + bool use_delayed; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); + if mi_unlikely(use_delayed) { + // unlikely: this only happens on the first concurrent free in a page that is in the full list + tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + } + else { + // usual: directly add to page thread_free list + mi_block_set_next(page, block, mi_tf_block(tfree)); + tfreex = mi_tf_set_block(tfree,block); + } + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + + // If this was the first non-local free, we need to push it on the heap delayed free list instead + if mi_unlikely(use_delayed) { + // racy read on `heap`, but ok because 
MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); + mi_assert_internal(heap != NULL); + if (heap != NULL) { + // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + do { + mi_block_set_nextx(heap,block,dfree, heap->keys); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } + + // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + tfreex = tfree; + mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); + tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + } +} + +// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) +{ + // first see if the segment was abandoned and if we can reclaim it into our thread + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + #if MI_HUGE_PAGE_ABANDON + segment->page_kind != MI_PAGE_HUGE && + #endif + mi_atomic_load_relaxed(&segment->thread_id) == 0) + { + // the segment is abandoned, try to reclaim it into our heap + if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { + mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + } + + // The padding check may access the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). 
+ mi_check_padding(page, block); + + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page,block)); + + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + _mi_padding_shrink(page, block, sizeof(mi_block_t)); + + if (segment->kind == MI_SEGMENT_HUGE) { + #if MI_HUGE_PAGE_ABANDON + // huge page segments are always abandoned and can be freed immediately + _mi_segment_huge_page_free(segment, page, block); + return; + #else + // huge pages are special as they occupy the entire segment + // as these are large we reset the memory occupied by the page so it is available to other threads + // (as the owning thread needs to actually free the memory later). + _mi_segment_huge_page_reset(segment, page, block); + #endif + } + else { + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + #endif + } + + // and finally free the actual block by pushing it on the owning heap + // thread_delayed free list (or heap delayed free list) + mi_free_block_delayed_mt(page,block); +} + + +// ------------------------------------------------------ +// Usable size +// ------------------------------------------------------ + +// Bytes available in a block +static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* page, const void* p) mi_attr_noexcept { + const mi_block_t* block = _mi_page_ptr_unalign(page, p); + const size_t size = mi_page_usable_size_of(page, block); + const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; + mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); + return (size - adjust); +} + +static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { + const mi_segment_t* const segment = 
mi_checked_ptr_segment(p, msg); + if mi_unlikely(segment==NULL) return 0; + const mi_page_t* const page = _mi_segment_page_of(segment, p); + if mi_likely(!mi_page_has_aligned(page)) { + const mi_block_t* block = (const mi_block_t*)p; + return mi_page_usable_size_of(page, block); + } + else { + // split out to separate routine for improved code generation + return mi_page_usable_aligned_size_of(page, p); + } +} + +mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { + return _mi_usable_size(p, "mi_usable_size"); +} + + +// ------------------------------------------------------ +// Free variants +// ------------------------------------------------------ + +void mi_free_size(void* p, size_t size) mi_attr_noexcept { + MI_UNUSED_RELEASE(size); + mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); + mi_free(p); +} + +void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free_size(p,size); +} + +void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free(p); +} + + +// ------------------------------------------------------ +// Check for double free in secure and debug mode +// This is somewhat expensive so only enabled for secure mode 4 +// ------------------------------------------------------ + +#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) +// linear check if the free list contains a specific element +static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { + while (list != NULL) { + if (elem==list) return true; + list = mi_block_next(page, list); + } + return false; +} + +static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { + // The decoded value is in the same page (or NULL). 
+ // Walk the free lists to verify positively if it is already freed + if (mi_list_contains(page, page->free, block) || + mi_list_contains(page, page->local_free, block) || + mi_list_contains(page, mi_page_thread_free(page), block)) + { + _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); + return true; + } + return false; +} + +#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } + +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + bool is_double_free = false; + mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field + if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? + (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? + { + // Suspicious: decoded value a in block is in the same page (or NULL) -- maybe a double free? 
+ // (continue in separate function to improve code generation) + is_double_free = mi_check_is_double_freex(page, block); + } + return is_double_free; +} +#else +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); + return false; +} +#endif + + +// --------------------------------------------------------------------------- +// Check for heap block overflow by setting up padding at the end of the block +// --------------------------------------------------------------------------- + +#if MI_PADDING // && !MI_TRACK_ENABLED +static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { + *bsize = mi_page_usable_block_size(page); + const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + *delta = padding->delta; + uint32_t canary = padding->canary; + uintptr_t keys[2]; + keys[0] = page->keys[0]; + keys[1] = page->keys[1]; + bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); + return ok; +} + +// Return the exact usable size of a block. +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); mi_assert_internal(delta <= bsize); + return (ok ? bsize - delta : 0); +} + +// When a non-thread-local block is freed, it becomes part of the thread delayed free +// list that is freed later by the owning heap. If the exact usable size is too small to +// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) +// so it will later not trigger an overflow error in `mi_free_block`. 
+void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); + if (!ok || (bsize - delta) >= min_size) return; // usually already enough space + mi_assert_internal(bsize >= min_size); + if (bsize < min_size) return; // should never happen + size_t new_delta = (bsize - min_size); + mi_assert_internal(new_delta < bsize); + mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + padding->delta = (uint32_t)new_delta; + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); +} +#else +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(block); + return mi_page_usable_block_size(page); +} + +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + MI_UNUSED(page); + MI_UNUSED(block); + MI_UNUSED(min_size); +} +#endif + +#if MI_PADDING && MI_PADDING_CHECK + +static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + *size = *wrong = bsize; + if (!ok) return false; + mi_assert_internal(bsize >= delta); + *size = bsize - delta; + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + mi_track_mem_defined(fill, maxpad); + for (size_t i = 0; i < maxpad; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - delta + i; + ok = false; + break; + } + } + mi_track_mem_noaccess(fill, maxpad); + } + return ok; +} + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + size_t size; + size_t wrong; + if (!mi_verify_padding(page,block,&size,&wrong)) { + _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + } +} + +#else + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); +} + +#endif + +// only maintain stats for smaller objects if requested +#if (MI_STAT>0) +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { + #if (MI_STAT < 2) + MI_UNUSED(block); + #endif + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_usable_block_size(page); + #if (MI_STAT>1) + const size_t usize = mi_page_usable_size_of(page, block); + mi_heap_stat_decrease(heap, malloc, usize); + #endif + if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, normal, bsize); + #if (MI_STAT > 1) + mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); + #endif + } + else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, large, bsize); + } + else { + mi_heap_stat_decrease(heap, huge, bsize); + } +} +#else +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); MI_UNUSED(block); +} +#endif diff --git a/third-party/mimalloc/src/heap.c b/third-party/mimalloc/src/heap.c index 58520ddf..e498fdb2 100644 --- a/third-party/mimalloc/src/heap.c +++ b/third-party/mimalloc/src/heap.c @@ -32,7 +32,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void #if MI_DEBUG>1 size_t total = heap->page_count; size_t count 
= 0; - #endif + #endif for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; @@ -95,6 +95,11 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL)); mi_collect_t collect = *((mi_collect_t*)arg_collect); _mi_page_free_collect(page, collect >= MI_FORCE); + if (collect == MI_FORCE) { + // note: call before a potential `_mi_page_free` as the segment may be freed if this was the last used page in that segment. + mi_segment_t* segment = _mi_page_segment(page); + _mi_segment_collect(segment, true /* force? */, &heap->tld->segments); + } if (mi_page_all_free(page)) { // no more used blocks, free the page. // note: this will free retired pages as well. @@ -120,17 +125,20 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { if (heap==NULL || !mi_heap_is_initialized(heap)) return; - const bool force = collect >= MI_FORCE; + const bool force = (collect >= MI_FORCE); _mi_deferred_free(heap, force); - // note: never reclaim on collect but leave it to threads that need storage to reclaim - const bool force_main = + // python/cpython#112532: we may be called from a thread that is not the owner of the heap + const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); + + // note: never reclaim on collect but leave it to threads that need storage to reclaim + const bool force_main = #ifdef NDEBUG collect == MI_FORCE #else collect >= MI_FORCE #endif - && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim; + && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim; if (force_main) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. 
@@ -157,17 +165,14 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect abandoned segments (in particular, purge expired parts of segments in the abandoned segment list) // note: forced purge can be quite expensive if many threads are created/destroyed so we do not force on abandonment _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments); - - // collect segment local caches - if (force) { - _mi_segment_thread_collect(&heap->tld->segments); - } - - // collect regions on program-exit (or shared library unload) - if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) { + + // if forced, collect thread data cache on program-exit (or shared library unload) + if (force && is_main_thread && mi_heap_is_backing(heap)) { _mi_thread_data_collect(); // collect thread data cache - _mi_arena_collect(true /* force purge */, &heap->tld->stats); } + + // collect arenas (this is program wide so don't force purges on abandonment of threads) + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -206,22 +211,33 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { - mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? 
- if (heap == NULL) return NULL; +void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = bheap->tld; - heap->thread_id = _mi_thread_id(); - heap->arena_id = arena_id; - _mi_random_split(&bheap->random, &heap->random); - heap->cookie = _mi_heap_random_next(heap) | 1; + heap->tld = tld; + heap->thread_id = _mi_thread_id(); + heap->arena_id = arena_id; + heap->no_reclaim = noreclaim; + heap->tag = tag; + if (heap == tld->heap_backing) { + _mi_random_init(&heap->random); + } + else { + _mi_random_split(&tld->heap_backing->random, &heap->random); + } + heap->cookie = _mi_heap_random_next(heap) | 1; heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); - heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe // push on the thread local heaps list heap->next = heap->tld->heaps; heap->tld->heaps = heap; +} + +mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { + mi_heap_t* bheap = mi_heap_get_backing(); + mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? 
+ if (heap == NULL) return NULL; + // don't reclaim abandoned pages or otherwise destroy is unsafe + _mi_heap_init(heap, bheap->tld, arena_id, true /* no reclaim */, 0 /* default tag */); return heap; } @@ -279,6 +295,18 @@ static void mi_heap_free(mi_heap_t* heap) { mi_free(heap); } +// return a heap on the same thread as `heap` specialized for the specified tag (if it exists) +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag) { + if (heap->tag == tag) { + return heap; + } + for (mi_heap_t *curr = heap->tld->heaps; curr != NULL; curr = curr->next) { + if (curr->tag == tag) { + return curr; + } + } + return NULL; +} /* ----------------------------------------------------------- Heap destroy @@ -425,7 +453,7 @@ void mi_heap_delete(mi_heap_t* heap) if (heap==NULL || !mi_heap_is_initialized(heap)) return; if (!mi_heap_is_backing(heap)) { - // tranfer still used pages to the backing heap + // transfer still used pages to the backing heap mi_heap_absorb(heap->tld->heap_backing, heap); } else { @@ -474,8 +502,7 @@ static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa MI_UNUSED(heap); MI_UNUSED(pq); bool* found = (bool*)vfound; - mi_segment_t* segment = _mi_page_segment(page); - void* start = _mi_page_start(segment, page, NULL); + void* start = mi_page_start(page); void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page)); *found = (p >= start && p < end); return (!*found); // continue if not found @@ -521,7 +548,7 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding size_t psize; - uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); if (page->capacity == 1) { // optimize page with one block @@ -588,7 +615,7 @@ static bool mi_heap_visit_areas_page(mi_heap_t* heap, 
mi_page_queue_t* pq, mi_pa xarea.page = page; xarea.area.reserved = page->reserved * bsize; xarea.area.committed = page->capacity * bsize; - xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL); + xarea.area.blocks = mi_page_start(page); xarea.area.used = page->used; // number of blocks in use (#553) xarea.area.block_size = ubsize; xarea.area.full_block_size = bsize; diff --git a/third-party/mimalloc/src/init.c b/third-party/mimalloc/src/init.c index b1db14c5..6f51ca89 100644 --- a/third-party/mimalloc/src/init.c +++ b/third-party/mimalloc/src/init.c @@ -14,25 +14,27 @@ terms of the MIT license. A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, false, false, false, + 0, + false, false, false, false, 0, // capacity 0, // reserved capacity { 0 }, // flags false, // is_zero 0, // retire_expire NULL, // free - 0, // used - 0, // xblock_size NULL, // local_free + 0, // used + 0, // block size shift + 0, // heap tag + 0, // block_size + NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap NULL, NULL - #if MI_INTPTR_SIZE==8 , { 0 } // padding - #endif }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -84,7 +86,9 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 } \ MI_STAT_COUNT_END_NULL() @@ -110,8 +114,6 @@ const mi_page_t _mi_page_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, MI_ATOMIC_VAR_INIT(NULL), 0, // tid 0, // cookie @@ -121,7 +123,10 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { 0, // page count MI_BIN_FULL, 
0, // page retired min/max NULL, // next - false + false, // can reclaim + 0, // tag + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; #define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats))) @@ -131,7 +136,7 @@ mi_decl_cache_align static const mi_tld_t tld_empty = { 0, false, NULL, NULL, - { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments + { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments { 0, tld_empty_stats }, // os { MI_STATS_NULL } // stats }; @@ -148,15 +153,13 @@ extern mi_heap_t _mi_heap_main; static mi_tld_t tld_main = { 0, false, &_mi_heap_main, & _mi_heap_main, - { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments + { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments { 0, &tld_main.stats }, // os { MI_STATS_NULL } // stats }; mi_heap_t _mi_heap_main = { &tld_main, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, MI_ATOMIC_VAR_INIT(NULL), 0, // thread id 0, // initial cookie @@ -166,7 +169,10 @@ mi_heap_t _mi_heap_main = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap - false // can reclaim + false, // can reclaim + 0, // tag + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. @@ -201,9 +207,9 @@ mi_heap_t* _mi_heap_main_get(void) { // note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size). 
typedef struct mi_thread_data_s { - mi_heap_t heap; // must come first due to cast in `_mi_heap_done` + mi_heap_t heap; // must come first due to cast in `_mi_heap_done` mi_tld_t tld; - mi_memid_t memid; + mi_memid_t memid; // must come last due to zero'ing } mi_thread_data_t; @@ -247,9 +253,9 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) { is_zero = memid.initially_zero; } } - + if (td != NULL && !is_zero) { - _mi_memzero_aligned(td, sizeof(*td)); + _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); } return td; } @@ -283,7 +289,7 @@ void _mi_thread_data_collect(void) { } // Initialize the thread local default heap, called from `mi_thread_init` -static bool _mi_heap_init(void) { +static bool _mi_thread_heap_init(void) { if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (_mi_is_main_thread()) { // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization @@ -299,26 +305,25 @@ static bool _mi_heap_init(void) { mi_tld_t* tld = &td->tld; mi_heap_t* heap = &td->heap; - _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld)); - _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap)); - heap->thread_id = _mi_thread_id(); - _mi_random_init(&heap->random); - heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); - heap->tld = tld; - tld->heap_backing = heap; - tld->heaps = heap; - tld->segments.stats = &tld->stats; - tld->segments.os = &tld->os; - tld->os.stats = &tld->stats; - _mi_heap_set_default_direct(heap); + _mi_tld_init(tld, heap); // must be before `_mi_heap_init` + _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); + _mi_heap_set_default_direct(heap); } return false; } +// initialize thread local data +void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { + _mi_memcpy_aligned(tld, &tld_empty, sizeof(mi_tld_t)); + tld->heap_backing = bheap; + 
tld->heaps = NULL; + tld->segments.stats = &tld->stats; + tld->segments.os = &tld->os; + tld->os.stats = &tld->stats; +} + // Free the thread local default heap (called from `mi_thread_done`) -static bool _mi_heap_done(mi_heap_t* heap) { +static bool _mi_thread_heap_done(mi_heap_t* heap) { if (!mi_heap_is_initialized(heap)) return true; // reset default heap @@ -415,7 +420,7 @@ void mi_thread_init(void) mi_attr_noexcept // initialize the thread local default heap // (this will call `_mi_heap_set_default_direct` and thus set the // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) - if (_mi_heap_init()) return; // returns true if already initialized + if (_mi_thread_heap_init()) return; // returns true if already initialized _mi_stat_increase(&_mi_stats_main.threads, 1); mi_atomic_increment_relaxed(&thread_count); @@ -426,28 +431,28 @@ void mi_thread_done(void) mi_attr_noexcept { _mi_thread_done(NULL); } -void _mi_thread_done(mi_heap_t* heap) +void _mi_thread_done(mi_heap_t* heap) { // calling with NULL implies using the default heap - if (heap == NULL) { - heap = mi_prim_get_default_heap(); + if (heap == NULL) { + heap = mi_prim_get_default_heap(); if (heap == NULL) return; } // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699) if (!mi_heap_is_initialized(heap)) { - return; + return; } // adjust stats mi_atomic_decrement_relaxed(&thread_count); _mi_stat_decrease(&_mi_stats_main.threads, 1); - + // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... 
if (heap->thread_id != _mi_thread_id()) return; // abandon the thread local heap - if (_mi_heap_done(heap)) return; // returns true if already ran + if (_mi_thread_heap_done(heap)) return; // returns true if already ran } void _mi_heap_set_default_direct(mi_heap_t* heap) { @@ -455,7 +460,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { #if defined(MI_TLS_SLOT) mi_prim_tls_slot_set(MI_TLS_SLOT,heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) - *mi_tls_pthread_heap_slot() = heap; + *mi_prim_tls_pthread_heap_slot() = heap; #elif defined(MI_TLS_PTHREAD) // we use _mi_heap_default_key #else @@ -464,7 +469,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. - _mi_prim_thread_associate_default_heap(heap); + _mi_prim_thread_associate_default_heap(heap); } @@ -624,7 +629,7 @@ static void mi_cdecl mi_process_done(void) { // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread _mi_prim_thread_done_auto_done(); - + #ifndef MI_SKIP_COLLECT_ON_EXIT #if (MI_DEBUG || !defined(MI_SHARED_LIB)) // free all memory if possible on process exit. This is not needed for a stand-alone process diff --git a/third-party/mimalloc/src/libc.c b/third-party/mimalloc/src/libc.c new file mode 100644 index 00000000..dd6b4007 --- /dev/null +++ b/third-party/mimalloc/src/libc.c @@ -0,0 +1,273 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +// -------------------------------------------------------- +// This module defines various std libc functions to reduce +// the dependency on libc, and also prevent errors caused +// by some libc implementations when called before `main` +// executes (due to malloc redirection) +// -------------------------------------------------------- + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/prim.h" // mi_prim_getenv + +char _mi_toupper(char c) { + if (c >= 'a' && c <= 'z') return (c - 'a' + 'A'); + else return c; +} + +int _mi_strnicmp(const char* s, const char* t, size_t n) { + if (n == 0) return 0; + for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { + if (_mi_toupper(*s) != _mi_toupper(*t)) break; + } + return (n == 0 ? 0 : *s - *t); +} + +void _mi_strlcpy(char* dest, const char* src, size_t dest_size) { + if (dest==NULL || src==NULL || dest_size == 0) return; + // copy until end of src, or when dest is (almost) full + while (*src != 0 && dest_size > 1) { + *dest++ = *src++; + dest_size--; + } + // always zero terminate + *dest = 0; +} + +void _mi_strlcat(char* dest, const char* src, size_t dest_size) { + if (dest==NULL || src==NULL || dest_size == 0) return; + // find end of string in the dest buffer + while (*dest != 0 && dest_size > 1) { + dest++; + dest_size--; + } + // and catenate + _mi_strlcpy(dest, src, dest_size); +} + +size_t _mi_strlen(const char* s) { + if (s==NULL) return 0; + size_t len = 0; + while(s[len] != 0) { len++; } + return len; +} + +size_t _mi_strnlen(const char* s, size_t max_len) { + if (s==NULL) return 0; + size_t len = 0; + while(s[len] != 0 && len < max_len) { len++; } + return len; +} + +#ifdef MI_NO_GETENV +bool _mi_getenv(const char* name, char* result, size_t result_size) { + MI_UNUSED(name); + MI_UNUSED(result); + MI_UNUSED(result_size); + return false; +} +#else +bool _mi_getenv(const char* name, char* result, size_t 
result_size) { + if (name==NULL || result == NULL || result_size < 64) return false; + return _mi_prim_getenv(name,result,result_size); +} +#endif + +// -------------------------------------------------------- +// Define our own limited `_mi_vsnprintf` and `_mi_snprintf` +// This is mostly to avoid calling these when libc is not yet +// initialized (and to reduce dependencies) +// +// format: d i, p x u, s +// prec: z l ll L +// width: 10 +// align-left: - +// fill: 0 +// plus: + +// -------------------------------------------------------- + +static void mi_outc(char c, char** out, char* end) { + char* p = *out; + if (p >= end) return; + *p = c; + *out = p + 1; +} + +static void mi_outs(const char* s, char** out, char* end) { + if (s == NULL) return; + char* p = *out; + while (*s != 0 && p < end) { + *p++ = *s++; + } + *out = p; +} + +static void mi_out_fill(char fill, size_t len, char** out, char* end) { + char* p = *out; + for (size_t i = 0; i < len && p < end; i++) { + *p++ = fill; + } + *out = p; +} + +static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, char* end) { + if (len == 0 || extra == 0) return; + if (start + len + extra >= end) return; + // move `len` characters to the right (in reverse since it can overlap) + for (size_t i = 1; i <= len; i++) { + start[len + extra - i] = start[len - i]; + } + // and fill the start + for (size_t i = 0; i < extra; i++) { + start[i] = fill; + } +} + + +static void mi_out_num(uintptr_t x, size_t base, char prefix, char** out, char* end) +{ + if (x == 0 || base == 0 || base > 16) { + if (prefix != 0) { mi_outc(prefix, out, end); } + mi_outc('0',out,end); + } + else { + // output digits in reverse + char* start = *out; + while (x > 0) { + char digit = (char)(x % base); + mi_outc((digit <= 9 ? 
'0' + digit : 'A' + digit - 10),out,end); + x = x / base; + } + if (prefix != 0) { + mi_outc(prefix, out, end); + } + size_t len = *out - start; + // and reverse in-place + for (size_t i = 0; i < (len / 2); i++) { + char c = start[len - i - 1]; + start[len - i - 1] = start[i]; + start[i] = c; + } + } +} + + +#define MI_NEXTC() c = *in; if (c==0) break; in++; + +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { + if (buf == NULL || bufsize == 0 || fmt == NULL) return; + buf[bufsize - 1] = 0; + char* const end = buf + (bufsize - 1); + const char* in = fmt; + char* out = buf; + while (true) { + if (out >= end) break; + char c; + MI_NEXTC(); + if (c != '%') { + if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only + mi_outc(c, &out, end); + } + } + else { + MI_NEXTC(); + char fill = ' '; + size_t width = 0; + char numtype = 'd'; + char numplus = 0; + bool alignright = true; + if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } + if (c == '-') { alignright = false; MI_NEXTC(); } + if (c == '0') { fill = '0'; MI_NEXTC(); } + if (c >= '1' && c <= '9') { + width = (c - '0'); MI_NEXTC(); + while (c >= '0' && c <= '9') { + width = (10 * width) + (c - '0'); MI_NEXTC(); + } + if (c == 0) break; // extra check due to while + } + if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } + else if (c == 'l') { + numtype = c; MI_NEXTC(); + if (c == 'l') { numtype = 'L'; MI_NEXTC(); } + } + + char* start = out; + if (c == 's') { + // string + const char* s = va_arg(args, const char*); + mi_outs(s, &out, end); + } + else if (c == 'p' || c == 'x' || c == 'u') { + // unsigned + uintptr_t x = 0; + if (c == 'x' || c == 'u') { + if (numtype == 'z') x = va_arg(args, size_t); + else if (numtype == 't') x = va_arg(args, uintptr_t); // unsigned ptrdiff_t + else if (numtype == 'L') x = (uintptr_t)va_arg(args, unsigned long long); + else x = va_arg(args, unsigned long); + } + else if (c == 'p') { 
+ x = va_arg(args, uintptr_t); + mi_outs("0x", &out, end); + start = out; + width = (width >= 2 ? width - 2 : 0); + } + if (width == 0 && (c == 'x' || c == 'p')) { + if (c == 'p') { width = 2 * (x <= UINT32_MAX ? 4 : ((x >> 16) <= UINT32_MAX ? 6 : sizeof(void*))); } + if (width == 0) { width = 2; } + fill = '0'; + } + mi_out_num(x, (c == 'x' || c == 'p' ? 16 : 10), numplus, &out, end); + } + else if (c == 'i' || c == 'd') { + // signed + intptr_t x = 0; + if (numtype == 'z') x = va_arg(args, intptr_t ); + else if (numtype == 't') x = va_arg(args, ptrdiff_t); + else if (numtype == 'L') x = (intptr_t)va_arg(args, long long); + else x = va_arg(args, long); + char pre = 0; + if (x < 0) { + pre = '-'; + if (x > INTPTR_MIN) { x = -x; } + } + else if (numplus != 0) { + pre = numplus; + } + mi_out_num((uintptr_t)x, 10, pre, &out, end); + } + else if (c >= ' ' && c <= '~') { + // unknown format + mi_outc('%', &out, end); + mi_outc(c, &out, end); + } + + // fill & align + mi_assert_internal(out <= end); + mi_assert_internal(out >= start); + const size_t len = out - start; + if (len < width) { + mi_out_fill(fill, width - len, &out, end); + if (alignright && out <= end) { + mi_out_alignright(fill, start, len, width - len, end); + } + } + } + } + mi_assert_internal(out <= end); + *out = 0; +} + +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + _mi_vsnprintf(buf, buflen, fmt, args); + va_end(args); +} diff --git a/third-party/mimalloc/src/options.c b/third-party/mimalloc/src/options.c index 345b560e..a62727dd 100644 --- a/third-party/mimalloc/src/options.c +++ b/third-party/mimalloc/src/options.c @@ -9,9 +9,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" // mi_prim_out_stderr -#include // FILE +#include // stdin/stdout #include // abort -#include + static long mi_max_error_count = 16; // stop outputting errors after this (use < 0 for no limit) @@ -65,7 +65,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N - { 0, UNINIT, MI_OPTION(reserve_os_memory) }, + { 0, UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread { 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) }, // reset free page memory when a thread terminates @@ -77,23 +77,30 @@ static mi_option_desc_t options[_mi_option_last] = #endif { 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. - { 0, UNINIT, MI_OPTION(limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) + { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose - { 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output - { 16, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output - { 8, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. 
number of segment reclaims from the abandoned segments per try. + { 32, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output + { 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output + { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! #if (MI_INTPTR_SIZE>4) - { 1024L * 1024L, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time + { 1024L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) #else - { 128L * 1024L, UNINIT, MI_OPTION(arena_reserve) }, + { 128L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // =128MiB on 32-bit #endif { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) + { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. }; static void mi_option_init(mi_option_desc_t* desc); +static bool mi_option_has_size_in_kib(mi_option_t option) { + return (option == mi_option_reserve_os_memory || option == mi_option_arena_reserve); +} + void _mi_options_init(void) { // called on process load; should not be called before the CRT is initialized! // (e.g. 
do not call this from process_init as that may run before CRT initialization) @@ -104,7 +111,7 @@ void _mi_options_init(void) { // if (option != mi_option_verbose) { mi_option_desc_t* desc = &options[option]; - _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value); + _mi_verbose_message("option '%s': %ld %s\n", desc->name, desc->value, (mi_option_has_size_in_kib(option) ? "KiB" : "")); } } mi_max_error_count = mi_option_get(mi_option_max_errors); @@ -128,9 +135,13 @@ mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long ma } mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) { - mi_assert_internal(option == mi_option_reserve_os_memory || option == mi_option_arena_reserve); - long x = mi_option_get(option); - return (x < 0 ? 0 : (size_t)x * MI_KiB); + mi_assert_internal(mi_option_has_size_in_kib(option)); + const long x = mi_option_get(option); + size_t size = (x < 0 ? 0 : (size_t)x); + if (mi_option_has_size_in_kib(option)) { + size *= MI_KiB; + } + return size; } void mi_option_set(mi_option_t option, long value) { @@ -311,12 +322,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me } // Define our own limited `fprintf` that avoids memory allocation. -// We do this using `snprintf` with a limited buffer. +// We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { char buf[512]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; - vsnprintf(buf,sizeof(buf)-1,fmt,args); + _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); mi_recurse_exit(); _mi_fputs(out,arg,prefix,buf); } @@ -331,7 +342,7 @@ void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... 
) { static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) { if (prefix != NULL && _mi_strnlen(prefix,33) <= 32 && !_mi_is_main_thread()) { char tprefix[64]; - snprintf(tprefix, sizeof(tprefix), "%sthread 0x%llx: ", prefix, (unsigned long long)_mi_thread_id()); + _mi_snprintf(tprefix, sizeof(tprefix), "%sthread 0x%tx: ", prefix, (uintptr_t)_mi_thread_id()); mi_vfprintf(out, arg, tprefix, fmt, args); } else { @@ -434,68 +445,6 @@ void _mi_error_message(int err, const char* fmt, ...) { // -------------------------------------------------------- // Initialize options by checking the environment // -------------------------------------------------------- -char _mi_toupper(char c) { - if (c >= 'a' && c <= 'z') return (c - 'a' + 'A'); - else return c; -} - -int _mi_strnicmp(const char* s, const char* t, size_t n) { - if (n == 0) return 0; - for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { - if (_mi_toupper(*s) != _mi_toupper(*t)) break; - } - return (n == 0 ? 
0 : *s - *t); -} - -void _mi_strlcpy(char* dest, const char* src, size_t dest_size) { - if (dest==NULL || src==NULL || dest_size == 0) return; - // copy until end of src, or when dest is (almost) full - while (*src != 0 && dest_size > 1) { - *dest++ = *src++; - dest_size--; - } - // always zero terminate - *dest = 0; -} - -void _mi_strlcat(char* dest, const char* src, size_t dest_size) { - if (dest==NULL || src==NULL || dest_size == 0) return; - // find end of string in the dest buffer - while (*dest != 0 && dest_size > 1) { - dest++; - dest_size--; - } - // and catenate - _mi_strlcpy(dest, src, dest_size); -} - -size_t _mi_strlen(const char* s) { - if (s==NULL) return 0; - size_t len = 0; - while(s[len] != 0) { len++; } - return len; -} - -size_t _mi_strnlen(const char* s, size_t max_len) { - if (s==NULL) return 0; - size_t len = 0; - while(s[len] != 0 && len < max_len) { len++; } - return len; -} - -#ifdef MI_NO_GETENV -static bool mi_getenv(const char* name, char* result, size_t result_size) { - MI_UNUSED(name); - MI_UNUSED(result); - MI_UNUSED(result_size); - return false; -} -#else -static bool mi_getenv(const char* name, char* result, size_t result_size) { - if (name==NULL || result == NULL || result_size < 64) return false; - return _mi_prim_getenv(name,result,result_size); -} -#endif // TODO: implement ourselves to reduce dependencies on the C runtime #include // strtol @@ -508,11 +457,11 @@ static void mi_option_init(mi_option_desc_t* desc) { char buf[64+1]; _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); _mi_strlcat(buf, desc->name, sizeof(buf)); - bool found = mi_getenv(buf, s, sizeof(s)); + bool found = _mi_getenv(buf, s, sizeof(s)); if (!found && desc->legacy_name != NULL) { _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); _mi_strlcat(buf, desc->legacy_name, sizeof(buf)); - found = mi_getenv(buf, s, sizeof(s)); + found = _mi_getenv(buf, s, sizeof(s)); if (found) { _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" 
instead.\n", desc->legacy_name, desc->name); } @@ -535,14 +484,20 @@ static void mi_option_init(mi_option_desc_t* desc) { else { char* end = buf; long value = strtol(buf, &end, 10); - if (desc->option == mi_option_reserve_os_memory || desc->option == mi_option_arena_reserve) { - // this option is interpreted in KiB to prevent overflow of `long` + if (mi_option_has_size_in_kib(desc->option)) { + // this option is interpreted in KiB to prevent overflow of `long` for large allocations + // (long is 32-bit on 64-bit windows, which allows for 4TiB max.) + size_t size = (value < 0 ? 0 : (size_t)value); + bool overflow = false; if (*end == 'K') { end++; } - else if (*end == 'M') { value *= MI_KiB; end++; } - else if (*end == 'G') { value *= MI_MiB; end++; } - else { value = (value + MI_KiB - 1) / MI_KiB; } - if (end[0] == 'I' && end[1] == 'B') { end += 2; } - else if (*end == 'B') { end++; } + else if (*end == 'M') { overflow = mi_mul_overflow(size,MI_KiB,&size); end++; } + else if (*end == 'G') { overflow = mi_mul_overflow(size,MI_MiB,&size); end++; } + else if (*end == 'T') { overflow = mi_mul_overflow(size,MI_GiB,&size); end++; } + else { size = (size + MI_KiB - 1) / MI_KiB; } + if (end[0] == 'I' && end[1] == 'B') { end += 2; } // KiB, MiB, GiB, TiB + else if (*end == 'B') { end++; } // Kb, Mb, Gb, Tb + if (overflow || size > MI_MAX_ALLOC_SIZE) { size = (MI_MAX_ALLOC_SIZE / MI_KiB); } + value = (size > LONG_MAX ? LONG_MAX : (long)size); } if (*end == 0) { desc->value = value; diff --git a/third-party/mimalloc/src/os.c b/third-party/mimalloc/src/os.c index b4f02ba3..ce104273 100644 --- a/third-party/mimalloc/src/os.c +++ b/third-party/mimalloc/src/os.c @@ -11,9 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ----------------------------------------------------------- - Initialization. - On windows initializes support for aligned allocation and - large OS pages (if MIMALLOC_LARGE_OS_PAGES is true). + Initialization. 
----------------------------------------------------------- */ static mi_os_mem_config_t mi_os_mem_config = { @@ -21,7 +19,7 @@ static mi_os_mem_config_t mi_os_mem_config = { 0, // large page size (usually 2MiB) 4096, // allocation granularity true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) - false, // must free whole? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) + false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory) }; @@ -29,7 +27,7 @@ bool _mi_os_has_overcommit(void) { return mi_os_mem_config.has_overcommit; } -bool _mi_os_has_virtual_reserve(void) { +bool _mi_os_has_virtual_reserve(void) { return mi_os_mem_config.has_virtual_reserve; } @@ -73,14 +71,6 @@ void _mi_os_init(void) { bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); -static void* mi_align_up_ptr(void* p, size_t alignment) { - return (void*)_mi_align_up((uintptr_t)p, alignment); -} - -static void* mi_align_down_ptr(void* p, size_t alignment) { - return (void*)_mi_align_down((uintptr_t)p, alignment); -} - /* ----------------------------------------------------------- aligned hinting @@ -141,13 +131,13 @@ static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; mi_assert_internal((size % _mi_os_page_size()) == 0); if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) int err = _mi_prim_free(addr, size); if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: 
%p)\n", err, err, size, addr); } - mi_stats_t* stats = &_mi_stats_main; if (still_committed) { _mi_stat_decrease(&stats->committed, size); } _mi_stat_decrease(&stats->reserved, size); } @@ -173,7 +163,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } else { - // nothing to do + // nothing to do mi_assert(memid.memkind < MI_MEM_OS); } } @@ -188,31 +178,33 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) -------------------------------------------------------------- */ // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* stats) { +static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_zero != NULL); mi_assert_internal(is_large != NULL); if (size == 0) return NULL; if (!commit) { allow_large = false; } if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning - *is_zero = false; - void* p = NULL; + void* p = NULL; int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); if (err != 0) { _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); } + + MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); - if (commit) { - _mi_stat_increase(&stats->committed, size); + if (commit) { + _mi_stat_increase(&stats->committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN 
if (*is_zero) { mi_track_mem_defined(p,size); } else { mi_track_mem_undefined(p,size); } #endif - } + } } return p; } @@ -245,11 +237,11 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; - if (mi_os_mem_config.must_free_whole) { // win32 virtualAlloc cannot free parts of an allocate block + if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block // over-allocate uncommitted (virtual) memory p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); if (p == NULL) return NULL; - + // set p to the aligned part in the full region // note: this is dangerous on Windows as VirtualFree needs the actual base pointer // this is handled though by having the `base` field in the memid's @@ -265,8 +257,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // overallocate... p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); if (p == NULL) return NULL; - - // and selectively unmap parts around the over-allocated area. (noop on sbrk) + + // and selectively unmap parts around the over-allocated area. 
void* aligned_p = mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; size_t mid_size = _mi_align_up(size, _mi_os_page_size()); @@ -274,9 +266,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit, stats); } if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } - // we can return the aligned pointer on `mmap` (and sbrk) systems + // we can return the aligned pointer on `mmap` systems p = aligned_p; - *base = aligned_p; // since we freed the pre part, `*base == p`. + *base = aligned_p; // since we freed the pre part, `*base == p`. } } @@ -289,10 +281,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit OS API: alloc and alloc_aligned ----------------------------------------------------------- */ -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); +void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { *memid = _mi_memid_none(); - mi_stats_t* stats = &_mi_stats_main; if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); bool os_is_large = false; @@ -300,23 +290,22 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* tld_stats) { void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); if (p != NULL) { *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); - } + } return p; } -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats) +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings - MI_UNUSED(tld_stats); *memid = _mi_memid_none(); if (size == 0) return 
NULL; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; - void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, &_mi_stats_main /*tld->stats*/ ); + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); if (p != NULL) { *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; @@ -327,13 +316,13 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used - for large alignments > MI_ALIGNMENT_MAX. We use a large mimalloc + for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc page where the object can be aligned at an offset from the start of the segment. As we may need to overallocate, we need to free such pointers using `mi_free_aligned` to use the actual start of the memory region. 
----------------------------------------------------------- */ -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats) { +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { mi_assert(offset <= MI_SEGMENT_SIZE); mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); @@ -341,20 +330,20 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld_stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); } else { // overallocate to align at an offset const size_t extra = _mi_align_up(offset, alignment) - offset; const size_t oversize = size + extra; - void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, tld_stats); + void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, stats); if (start == NULL) return NULL; void* const p = (uint8_t*)start + extra; mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); // decommit the overallocation at the start if (commit && extra > _mi_os_page_size()) { - _mi_os_decommit(start, extra, tld_stats); + _mi_os_decommit(start, extra, stats); } return p; } @@ -390,7 +379,7 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; + mi_stats_t* stats = &_mi_stats_main; if (is_zero != NULL) { *is_zero = false; } _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. 
decommit _mi_stat_counter_increase(&stats->commit_calls, 1); @@ -400,21 +389,21 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats void* start = mi_os_page_align_areax(false /* conservative? */, addr, size, &csize); if (csize == 0) return true; - // commit + // commit bool os_is_zero = false; - int err = _mi_prim_commit(start, csize, &os_is_zero); + int err = _mi_prim_commit(start, csize, &os_is_zero); if (err != 0) { _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); return false; } - if (os_is_zero && is_zero != NULL) { + if (os_is_zero && is_zero != NULL) { *is_zero = true; mi_assert_expensive(mi_mem_is_zero(start, csize)); } // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails) #ifdef MI_TRACK_ASAN if (os_is_zero) { mi_track_mem_defined(start,csize); } - else { mi_track_mem_undefined(start,csize); } + else { mi_track_mem_undefined(start,csize); } #endif return true; } @@ -428,11 +417,11 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_ // page align size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); - if (csize == 0) return true; + if (csize == 0) return true; // decommit *needs_recommit = true; - int err = _mi_prim_decommit(start,csize,needs_recommit); + int err = _mi_prim_decommit(start,csize,needs_recommit); if (err != 0) { _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); } @@ -450,7 +439,7 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. 
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); @@ -470,7 +459,7 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { } -// either resets or decommits memory, returns true if the memory needs +// either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) { @@ -483,7 +472,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) { bool needs_recommit = true; mi_os_decommit_ex(p, size, &needs_recommit, stats); - return needs_recommit; + return needs_recommit; } else { if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed @@ -493,7 +482,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) } } -// either resets or decommits memory, returns true if the memory needs +// either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { return _mi_os_purge_ex(p, size, true, stats); diff --git a/third-party/mimalloc/src/page-queue.c b/third-party/mimalloc/src/page-queue.c index cb54b374..ceea91ee 100644 --- a/third-party/mimalloc/src/page-queue.c +++ b/third-party/mimalloc/src/page-queue.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
@@ -11,6 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" +// include to help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" #endif /* ----------------------------------------------------------- @@ -109,10 +113,10 @@ size_t _mi_bin_size(uint8_t bin) { // Good size for allocation size_t mi_good_size(size_t size) mi_attr_noexcept { if (size <= MI_MEDIUM_OBJ_SIZE_MAX) { - return _mi_bin_size(mi_bin(size)); + return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { - return _mi_align_up(size,_mi_os_page_size()); + return _mi_align_up(size + MI_PADDING_SIZE,_mi_os_page_size()); } } @@ -137,21 +141,25 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif -static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size)); - mi_heap_t* heap = mi_page_heap(page); - mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL); - mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size); - mi_assert_expensive(mi_page_queue_contains(pq, page)); - return pq; +static inline bool mi_page_is_large_or_huge(const mi_page_t* page) { + return (mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_huge(page)); } static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size)); + mi_assert_internal(heap!=NULL); + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? 
MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); mi_assert_internal(bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size); + mi_assert_internal((mi_page_block_size(page) == pq->block_size) || + (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(pq)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(pq))); + return pq; +} + +static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { + mi_heap_t* heap = mi_page_heap(page); + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } @@ -206,7 +214,9 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; @@ -232,8 +242,8 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); #endif - mi_assert_internal(page->xblock_size == queue->block_size || - (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX) || + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); @@ 
-259,12 +269,13 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); - - mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) || - (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) || - (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + const size_t bsize = mi_page_block_size(page); + MI_UNUSED(bsize); + mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || + (bsize == to->block_size && mi_page_queue_is_full(from)) || + (bsize == from->block_size && mi_page_queue_is_full(to)) || + (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(to)) || + (mi_page_is_large_or_huge(page) && mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; diff --git a/third-party/mimalloc/src/page.c b/third-party/mimalloc/src/page.c index 8ac0a715..871ed215 100644 --- a/third-party/mimalloc/src/page.c +++ b/third-party/mimalloc/src/page.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
@@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { @@ -78,14 +78,13 @@ static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { } static bool mi_page_is_valid_init(mi_page_t* page) { - mi_assert_internal(page->xblock_size > 0); + mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - mi_segment_t* segment = _mi_page_segment(page); - uint8_t* start = _mi_page_start(segment,page,NULL); - mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL)); - //const size_t bsize = mi_page_block_size(page); + uint8_t* start = mi_page_start(page); + mi_assert_internal(start == _mi_segment_page_start(_mi_page_segment(page), page, NULL)); + mi_assert_internal(page->is_huge == (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -125,9 +124,9 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id); #if MI_HUGE_PAGE_ABANDON - if (segment->kind != MI_SEGMENT_HUGE) + if (segment->kind != MI_SEGMENT_HUGE) #endif - { + { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page)); @@ -193,8 +192,8 @@ static void _mi_page_thread_free_collect(mi_page_t* page) 
if (head == NULL) return; // find the tail -- also to get a proper count (without data races) - uint32_t max_count = page->capacity; // cannot collect more than capacity - uint32_t count = 1; + size_t max_count = page->capacity; // cannot collect more than capacity + size_t count = 1; mi_block_t* tail = head; mi_block_t* next; while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { @@ -212,7 +211,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) page->local_free = head; // update counts now - page->used -= count; + page->used -= (uint16_t)count; } void _mi_page_free_collect(mi_page_t* page, bool force) { @@ -263,7 +262,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); #endif - + // TODO: push on full queue immediately if it is full? mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); @@ -282,11 +281,13 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } + #if MI_HUGE_PAGE_ABANDON + mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); + #endif mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE); - mi_assert_internal(pq!=NULL || page->xblock_size != 0); mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); // a fresh page was found, initialize it - const size_t full_block_size = ((pq == NULL || mi_page_queue_is_huge(pq)) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc + const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? 
mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc mi_assert_internal(full_block_size >= block_size); mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); @@ -427,8 +428,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { _mi_segment_page_free(page, force, segments_tld); } -// Retire parameters -#define MI_MAX_RETIRE_SIZE (MI_MEDIUM_OBJ_SIZE_MAX) +#define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks @@ -441,7 +441,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(mi_page_all_free(page)); - + mi_page_set_has_aligned(page, false); // don't retire too often.. @@ -451,10 +451,11 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); - if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_queue_is_special(pq)) { // not too large && not full or huge queue? + const size_t bsize = mi_page_block_size(page); + if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; @@ -462,7 +463,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { if (index < heap->page_retired_min) heap->page_retired_min = index; if (index > heap->page_retired_max) heap->page_retired_max = index; mi_assert_internal(mi_page_all_free(page)); - return; // dont't free after all + return; // don't free after all } } _mi_page_free(page, pq, false); @@ -516,7 +517,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + void* const page_area = mi_page_start(page); // initialize a randomized free list // set up `slice_count` slices to alternate between @@ -574,7 +575,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); + void* const page_area = mi_page_start(page); mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity); @@ -608,7 +609,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // allocations but this did not speed up any benchmark (due to an // extra test in malloc? or cache effects?) 
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { - MI_UNUSED(tld); + MI_UNUSED(tld); mi_assert_expensive(mi_page_is_valid_init(page)); #if (MI_SECURE<=2) mi_assert(page->free == NULL); @@ -617,16 +618,14 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) #endif if (page->capacity >= page->reserved) return; - size_t page_size; - _mi_page_start(_mi_page_segment(page), page, &page_size); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count - const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size); + const size_t bsize = mi_page_block_size(page); size_t extend = page->reserved - page->capacity; mi_assert_internal(extend > 0); - size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize); + size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; } mi_assert_internal(max_extend > 0); @@ -660,11 +659,10 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(block_size > 0); // set fields mi_page_set_heap(page, heap); - page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? 
(uint32_t)block_size : MI_HUGE_BLOCK_SIZE); // initialize before _mi_segment_page_start + page->block_size = block_size; size_t page_size; - const void* page_start = _mi_segment_page_start(segment, page, &page_size); - MI_UNUSED(page_start); - mi_track_mem_noaccess(page_start,page_size); + page->page_start = _mi_segment_page_start(segment, page, &page_size); + mi_track_mem_noaccess(page->page_start,page_size); mi_assert_internal(mi_page_block_size(page) <= page_size); mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE); mi_assert_internal(page_size / block_size < (1L<<16)); @@ -677,12 +675,18 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi page->free_is_zero = page->is_zero_init; #if MI_DEBUG>2 if (page->is_zero_init) { - mi_track_mem_defined(page_start, page_size); - mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); + mi_track_mem_defined(page->page_start, page_size); + mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); } #endif - mi_assert_internal(page->is_committed); + if (block_size > 0 && _mi_is_power_of_two(block_size)) { + page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); + } + else { + page->block_size_shift = 0; + } + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -695,6 +699,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif + mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list @@ -718,7 +723,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p while (page != NULL) { mi_page_t* next = page->next; // remember next - #if MI_STAT + #if MI_STAT count++; #endif @@ -820,11 +825,9 @@ void 
mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex ----------------------------------------------------------- */ // Large and huge page allocation. -// Huge pages are allocated directly without being in a queue. -// Because huge pages contain just one block, and the segment contains -// just that page, we always treat them as abandoned and any thread -// that frees the block can free the whole page and segment directly. -// Huge pages are also use if the requested alignment is very large (> MI_ALIGNMENT_MAX). +// Huge pages contain just one block, and the segment contains just that page (as `MI_SEGMENT_HUGE`). +// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) +// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); @@ -832,25 +835,26 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size)); #else - mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? MI_HUGE_BLOCK_SIZE : block_size); // not block_size as that can be low if the page_alignment > 0 + mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? 
MI_LARGE_OBJ_SIZE_MAX+1 : block_size); mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); if (page != NULL) { mi_assert_internal(mi_page_immediate_available(page)); - + if (is_huge) { + mi_assert_internal(mi_page_is_huge(page)); mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE); mi_assert_internal(_mi_page_segment(page)->used==1); #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue mi_page_set_heap(page, NULL); - #endif + #endif } else { - mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + mi_assert_internal(!mi_page_is_huge(page)); } - + const size_t bsize = mi_page_usable_block_size(page); // note: not `mi_page_block_size` to account for padding if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, large, bsize); @@ -869,9 +873,9 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { // huge allocation? 
- const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { - if mi_unlikely(req_size > PTRDIFF_MAX) { // we don't allocate more than PTRDIFF_MAX (see ) + if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } @@ -882,7 +886,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme else { // otherwise find a page with free blocks in our size segregated queues #if MI_PADDING - mi_assert_internal(size >= MI_PADDING_SIZE); + mi_assert_internal(size >= MI_PADDING_SIZE); #endif return mi_find_free_page(heap, size); } @@ -898,7 +902,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // initialize if necessary if mi_unlikely(!mi_heap_is_initialized(heap)) { - heap = mi_heap_get_default(); // calls mi_thread_init + heap = mi_heap_get_default(); // calls mi_thread_init if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); @@ -926,14 +930,14 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_block_size(page) >= size); // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) - if mi_unlikely(zero && page->xblock_size == 0) { + if mi_unlikely(zero && page->block_size == 0) { // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. 
- void* p = _mi_page_malloc(heap, page, size, false); + void* p = _mi_page_malloc(heap, page, size); mi_assert_internal(p != NULL); _mi_memzero_aligned(p, mi_page_usable_block_size(page)); return p; } else { - return _mi_page_malloc(heap, page, size, zero); + return _mi_page_malloc_zero(heap, page, size, zero); } } diff --git a/third-party/mimalloc/src/prim/emscripten/prim.c b/third-party/mimalloc/src/prim/emscripten/prim.c new file mode 100644 index 00000000..f3797c9e --- /dev/null +++ b/third-party/mimalloc/src/prim/emscripten/prim.c @@ -0,0 +1,244 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +// This file is included in `src/prim/prim.c` + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" + +// Design +// ====== +// +// mimalloc is built on top of emmalloc. emmalloc is a minimal allocator on top +// of sbrk. The reason for having three layers here is that we want mimalloc to +// be able to allocate and release system memory properly, the same way it would +// when using VirtualAlloc on Windows or mmap on POSIX, and sbrk is too limited. +// Specifically, sbrk can only go up and down, and not "skip" over regions, and +// so we end up either never freeing memory to the system, or we can get stuck +// with holes. +// +// Atm wasm generally does *not* free memory back the system: once grown, we do +// not shrink back down (https://github.com/WebAssembly/design/issues/1397). 
+// However, that is expected to improve +// (https://github.com/WebAssembly/memory-control/blob/main/proposals/memory-control/Overview.md) +// and so we do not want to bake those limitations in here. +// +// Even without that issue, we want our system allocator to handle holes, that +// is, it should merge freed regions and allow allocating new content there of +// the full size, etc., so that we do not waste space. That means that the +// system allocator really does need to handle the general problem of allocating +// and freeing variable-sized chunks of memory in a random order, like malloc/ +// free do. And so it makes sense to layer mimalloc on top of such an +// implementation. +// +// emmalloc makes sense for the lower level because it is small and simple while +// still fully handling merging of holes etc. It is not the most efficient +// allocator, but our assumption is that mimalloc needs to be fast while the +// system allocator underneath it is called much less frequently. +// + +//--------------------------------------------- +// init +//--------------------------------------------- + +void _mi_prim_mem_init( mi_os_mem_config_t* config) { + config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB + config->alloc_granularity = 16; + config->has_overcommit = false; + config->has_partial_free = false; + config->has_virtual_reserve = false; +} + +extern void emmalloc_free(void*); + +int _mi_prim_free(void* addr, size_t size) { + MI_UNUSED(size); + emmalloc_free(addr); + return 0; +} + + +//--------------------------------------------- +// Allocation +//--------------------------------------------- + +extern void* emmalloc_memalign(size_t alignment, size_t size); + +// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. 
+int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { + MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit); + *is_large = false; + // TODO: Track the highest address ever seen; first uses of it are zeroes. + // That assumes no one else uses sbrk but us (they could go up, + // scribble, and then down), but we could assert on that perhaps. + *is_zero = false; + // emmalloc has a minimum alignment size. + #define MIN_EMMALLOC_ALIGN 8 + if (try_alignment < MIN_EMMALLOC_ALIGN) { + try_alignment = MIN_EMMALLOC_ALIGN; + } + void* p = emmalloc_memalign(try_alignment, size); + *addr = p; + if (p == 0) { + return ENOMEM; + } + return 0; +} + + +//--------------------------------------------- +// Commit/Reset +//--------------------------------------------- + +int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { + MI_UNUSED(addr); MI_UNUSED(size); + // See TODO above. + *is_zero = false; + return 0; +} + +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { + MI_UNUSED(addr); MI_UNUSED(size); + *needs_recommit = false; + return 0; +} + +int _mi_prim_reset(void* addr, size_t size) { + MI_UNUSED(addr); MI_UNUSED(size); + return 0; +} + +int _mi_prim_protect(void* addr, size_t size, bool protect) { + MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); + return 0; +} + + +//--------------------------------------------- +// Huge pages and NUMA nodes +//--------------------------------------------- + +int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { + MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node); + *is_zero = true; + *addr = NULL; + return ENOSYS; +} + +size_t _mi_prim_numa_node(void) { + return 0; +} + +size_t _mi_prim_numa_node_count(void) { + return 1; +} + + +//---------------------------------------------------------------- +// Clock 
+//---------------------------------------------------------------- + +#include + +mi_msecs_t _mi_prim_clock_now(void) { + return emscripten_date_now(); +} + + +//---------------------------------------------------------------- +// Process info +//---------------------------------------------------------------- + +void _mi_prim_process_info(mi_process_info_t* pinfo) +{ + // use defaults + MI_UNUSED(pinfo); +} + + +//---------------------------------------------------------------- +// Output +//---------------------------------------------------------------- + +#include + +void _mi_prim_out_stderr( const char* msg) { + emscripten_console_error(msg); +} + + +//---------------------------------------------------------------- +// Environment +//---------------------------------------------------------------- + +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + // For code size reasons, do not support environ customization for now. + MI_UNUSED(name); + MI_UNUSED(result); + MI_UNUSED(result_size); + return false; +} + + +//---------------------------------------------------------------- +// Random +//---------------------------------------------------------------- + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + int err = getentropy(buf, buf_len); + return !err; +} + + +//---------------------------------------------------------------- +// Thread init/done +//---------------------------------------------------------------- + +#ifdef __EMSCRIPTEN_SHARED_MEMORY__ + +// use pthread local storage keys to detect thread ending +// (and used with MI_TLS_PTHREADS for the default heap) +pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); + +static void mi_pthread_done(void* value) { + if (value!=NULL) { + _mi_thread_done((mi_heap_t*)value); + } +} + +void _mi_prim_thread_init_auto_done(void) { + mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); + pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); +} + +void 
_mi_prim_thread_done_auto_done(void) { + // nothing to do +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD + pthread_setspecific(_mi_heap_default_key, heap); + } +} + +#else + +void _mi_prim_thread_init_auto_done(void) { + // nothing +} + +void _mi_prim_thread_done_auto_done(void) { + // nothing +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); + +} +#endif diff --git a/third-party/mimalloc/src/prim/osx/alloc-override-zone.c b/third-party/mimalloc/src/prim/osx/alloc-override-zone.c index 0e0a99d9..1515b886 100644 --- a/third-party/mimalloc/src/prim/osx/alloc-override-zone.c +++ b/third-party/mimalloc/src/prim/osx/alloc-override-zone.c @@ -225,7 +225,9 @@ static malloc_zone_t mi_malloc_zone = { // switch to version 9+ on OSX 10.6 to support memalign. .memalign = &zone_memalign, .free_definite_size = &zone_free_definite_size, + #if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7) .pressure_relief = &zone_pressure_relief, + #endif #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14) .claimed_address = &zone_claimed_address, #endif @@ -420,6 +422,7 @@ __attribute__((constructor(0))) #else __attribute__((constructor)) // seems not supported by g++-11 on the M1 #endif +__attribute__((used)) static void _mi_macos_override_malloc(void) { malloc_zone_t* purgeable_zone = NULL; diff --git a/third-party/mimalloc/src/prim/prim.c b/third-party/mimalloc/src/prim/prim.c index 9a597d8e..3b7d3736 100644 --- a/third-party/mimalloc/src/prim/prim.c +++ b/third-party/mimalloc/src/prim/prim.c @@ -18,6 +18,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_USE_SBRK #include "wasi/prim.c" // memory-grow or sbrk (Wasm) +#elif defined(__EMSCRIPTEN__) +#include "emscripten/prim.c" // emmalloc_*, + pthread support + #else #include "unix/prim.c" // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.) diff --git a/third-party/mimalloc/src/prim/unix/prim.c b/third-party/mimalloc/src/prim/unix/prim.c index a9c0db60..90a4aac2 100644 --- a/third-party/mimalloc/src/prim/unix/prim.c +++ b/third-party/mimalloc/src/prim/unix/prim.c @@ -27,20 +27,27 @@ terms of the MIT license. A copy of the license can be found in the file #include // mmap #include // sysconf -#include // getenv +#include // open, close, read, access +#include #if defined(__linux__) #include - #include + #if defined(MI_NO_THP) + #include + #endif #if defined(__GLIBC__) #include // linux mmap flags #else #include #endif #elif defined(__APPLE__) + #include #include - #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR - #include + #if !defined(TARGET_OS_OSX) || TARGET_OS_OSX // see issue #879, used to be (!TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR) + #include // VM_MAKE_TAG, VM_FLAGS_SUPERPAGE_SIZE_2MB, etc. + #endif + #if !defined(MAC_OS_X_VERSION_10_7) + #define MAC_OS_X_VERSION_10_7 1070 #endif #elif defined(__FreeBSD__) || defined(__DragonFly__) #include @@ -51,44 +58,46 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif -#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) +#if defined(__linux__) || defined(__FreeBSD__) #define MI_HAS_SYSCALL_H #include #endif + //------------------------------------------------------------------------------------ // Use syscalls for some primitives to allow for libraries that override open/read/close etc. 
-// and do allocation themselves; using syscalls prevents recursion when mimalloc is +// and do allocation themselves; using syscalls prevents recursion when mimalloc is // still initializing (issue #713) +// Declare inline to avoid unused function warnings. //------------------------------------------------------------------------------------ #if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access) -static int mi_prim_open(const char* fpath, int open_flags) { +static inline int mi_prim_open(const char* fpath, int open_flags) { return syscall(SYS_open,fpath,open_flags,0); } -static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { +static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { return syscall(SYS_read,fd,buf,bufsize); } -static int mi_prim_close(int fd) { +static inline int mi_prim_close(int fd) { return syscall(SYS_close,fd); } -static int mi_prim_access(const char *fpath, int mode) { +static inline int mi_prim_access(const char *fpath, int mode) { return syscall(SYS_access,fpath,mode); } -#elif !defined(__APPLE__) // avoid unused warnings +#else -static int mi_prim_open(const char* fpath, int open_flags) { +static inline int mi_prim_open(const char* fpath, int open_flags) { return open(fpath,open_flags); } -static ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { +static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) { return read(fd,buf,bufsize); } -static int mi_prim_close(int fd) { +static inline int mi_prim_close(int fd) { return close(fd); } -static int mi_prim_access(const char *fpath, int mode) { +static inline int mi_prim_access(const char *fpath, int mode) { return access(fpath,mode); } @@ -121,12 +130,13 @@ static bool unix_detect_overcommit(void) { os_overcommit = (val != 0); } #else - // default: overcommit is true + // default: overcommit is true #endif return os_overcommit; } -void _mi_prim_mem_init( mi_os_mem_config_t* config ) { 
+void _mi_prim_mem_init( mi_os_mem_config_t* config ) +{ long psize = sysconf(_SC_PAGESIZE); if (psize > 0) { config->page_size = (size_t)psize; @@ -134,8 +144,26 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) { } config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? config->has_overcommit = unix_detect_overcommit(); - config->must_free_whole = false; // mmap can free in parts + config->has_partial_free = true; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) + + // disable transparent huge pages for this process? + #if (defined(__linux__) || defined(__ANDROID__)) && defined(PR_GET_THP_DISABLE) + #if defined(MI_NO_THP) + if (true) + #else + if (!mi_option_is_enabled(mi_option_allow_large_os_pages)) // disable THP also if large OS pages are not allowed in the options + #endif + { + int val = 0; + if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { + // Most likely since distros often come with always/madvise settings. 
+ val = 1; + // Disabling only for mimalloc process rather than touching system wide settings + (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); + } + } + #endif } @@ -169,12 +197,12 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p size_t n = mi_bsr(try_alignment); if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); - if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { + if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { int err = errno; - _mi_warning_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr); + _mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr); } if (p!=MAP_FAILED) return p; - // fall back to regular mmap + // fall back to regular mmap } } #elif defined(MAP_ALIGN) // Solaris @@ -190,16 +218,16 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p void* hint = _mi_os_get_aligned_hint(try_alignment, size); if (hint != NULL) { p = mmap(hint, size, protect_flags, flags, fd, 0); - if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { + if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { #if MI_TRACK_ENABLED // asan sometimes does not instrument errno correctly? 
int err = 0; #else int err = errno; #endif - _mi_warning_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint); + _mi_trace_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint); } if (p!=MAP_FAILED) return p; - // fall back to regular mmap + // fall back to regular mmap } } #endif @@ -277,7 +305,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec *is_large = true; p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); #ifdef MAP_HUGE_1GB - if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { + if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) { mi_huge_pages_available = false; // don't try huge 1GiB pages again _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); @@ -311,7 +339,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec #elif defined(__sun) if (allow_large && _mi_os_use_large_page(size, try_alignment)) { struct memcntl_mha cmd = {0}; - cmd.mha_pagesize = large_os_page_size; + cmd.mha_pagesize = _mi_os_large_page_size(); cmd.mha_cmd = MHA_MAPSIZE_VA; if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { *is_large = true; @@ -328,9 +356,9 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); - + *is_zero = true; - int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); + int protect_flags = (commit ? 
(PROT_WRITE | PROT_READ) : PROT_NONE); *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); return (*addr != NULL ? 0 : errno); } @@ -358,19 +386,19 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) { // was either from mmap PROT_NONE, or from decommit MADV_DONTNEED, but // we sometimes call commit on a range with still partially committed // memory and `mprotect` does not zero the range. - *is_zero = false; + *is_zero = false; int err = mprotect(start, size, (PROT_READ | PROT_WRITE)); - if (err != 0) { - err = errno; + if (err != 0) { + err = errno; unix_mprotect_hint(err); } return err; } int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { - int err = 0; + int err = 0; // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) - err = unix_madvise(start, size, MADV_DONTNEED); + err = unix_madvise(start, size, MADV_DONTNEED); #if !MI_DEBUG && !MI_SECURE *needs_recommit = false; #else @@ -382,15 +410,15 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { *needs_recommit = true; const int fd = unix_mmap_fd(); void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0); - if (p != start) { err = errno; } + if (p != start) { err = errno; } */ return err; } int _mi_prim_reset(void* start, size_t size) { - // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it + // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it // will not reduce the `rss` stats in tools like `top` even though the memory is available - // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by + // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by // default `MADV_DONTNEED` is used though. 
#if defined(MADV_FREE) static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); @@ -410,7 +438,7 @@ int _mi_prim_reset(void* start, size_t size) { int _mi_prim_protect(void* start, size_t size, bool protect) { int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE)); - if (err != 0) { err = errno; } + if (err != 0) { err = errno; } unix_mprotect_hint(err); return err; } @@ -451,7 +479,7 @@ int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bo if (err != 0) { err = errno; _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d (error: %d (0x%x))\n", numa_node, err, err); - } + } } return (*addr != NULL ? 0 : errno); } @@ -473,8 +501,6 @@ int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bo #if defined(__linux__) -#include // snprintf - size_t _mi_prim_numa_node(void) { #if defined(MI_HAS_SYSCALL_H) && defined(SYS_getcpu) unsigned long node = 0; @@ -492,7 +518,7 @@ size_t _mi_prim_numa_node_count(void) { unsigned node = 0; for(node = 0; node < 256; node++) { // enumerate node entries -- todo: it there a more efficient way to do this? 
(but ensure there is no allocation) - snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); + _mi_snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); if (mi_prim_access(buf,R_OK) != 0) break; } return (node+1); @@ -568,9 +594,9 @@ mi_msecs_t _mi_prim_clock_now(void) { // low resolution timer mi_msecs_t _mi_prim_clock_now(void) { #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) - return (mi_msecs_t)clock(); + return (mi_msecs_t)clock(); #elif (CLOCKS_PER_SEC < 1000) - return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); #else return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); #endif @@ -610,7 +636,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) pinfo->stime = timeval_secs(&rusage.ru_stime); #if !defined(__HAIKU__) pinfo->page_faults = rusage.ru_majflt; -#endif +#endif #if defined(__HAIKU__) // Haiku does not have (yet?) a way to // get these stats per process @@ -730,40 +756,30 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { // Random //---------------------------------------------------------------- -#if defined(__APPLE__) - -#include -#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 +#if defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_15) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_15) #include #include -#endif + bool _mi_prim_random_buf(void* buf, size_t buf_len) { - #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 - // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf - // may fail silently on macOS. 
See PR #390, and - return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); - #else - // fall back on older macOS - arc4random_buf(buf, buf_len); - return true; - #endif + // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf + // may fail silently on macOS. See PR #390, and + return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); } #elif defined(__ANDROID__) || defined(__DragonFly__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__sun) + defined(__sun) || \ + (defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7)) -#include bool _mi_prim_random_buf(void* buf, size_t buf_len) { arc4random_buf(buf, buf_len); return true; } -#elif defined(__linux__) || defined(__HAIKU__) +#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__) // also for old apple versions < 10.7 (issue #829) #include #include -#include #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { @@ -834,7 +850,9 @@ void _mi_prim_thread_init_auto_done(void) { } void _mi_prim_thread_done_auto_done(void) { - // nothing to do + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // do not leak the key, see issue #809 + pthread_key_delete(_mi_heap_default_key); + } } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { @@ -843,7 +861,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } } -#else +#else void _mi_prim_thread_init_auto_done(void) { // nothing diff --git a/third-party/mimalloc/src/prim/wasi/prim.c b/third-party/mimalloc/src/prim/wasi/prim.c index 50511f0b..e95f67f5 100644 --- a/third-party/mimalloc/src/prim/wasi/prim.c +++ b/third-party/mimalloc/src/prim/wasi/prim.c @@ -12,6 +12,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" +#include // fputs +#include // getenv + //--------------------------------------------- // Initialize //--------------------------------------------- @@ -20,7 +23,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB config->alloc_granularity = 16; config->has_overcommit = false; - config->must_free_whole = true; + config->has_partial_free = false; config->has_virtual_reserve = false; } @@ -40,6 +43,8 @@ int _mi_prim_free(void* addr, size_t size ) { //--------------------------------------------- #if defined(MI_USE_SBRK) + #include // for sbrk + static void* mi_memory_grow( size_t size ) { void* p = sbrk(size); if (p == (void*)(-1)) return NULL; diff --git a/third-party/mimalloc/src/prim/windows/prim.c b/third-party/mimalloc/src/prim/windows/prim.c index e6b61079..5074ad4c 100644 --- a/third-party/mimalloc/src/prim/windows/prim.c +++ b/third-party/mimalloc/src/prim/windows/prim.c @@ -112,7 +112,7 @@ static bool win_enable_large_os_pages(size_t* large_page_size) void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->has_overcommit = false; - config->must_free_whole = true; + config->has_partial_free = false; config->has_virtual_reserve = true; // get the page size SYSTEM_INFO si; @@ -178,7 +178,7 @@ int _mi_prim_free(void* addr, size_t size ) { // VirtualAlloc //--------------------------------------------- -static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) { +static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_alignment, DWORD flags) { #if (MI_INTPTR_SIZE >= 8) // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations if (addr == NULL) { @@ -200,13 +200,53 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen param.Arg.Pointer = &reqs; 
void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); if (p != NULL) return p; - _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags); + _mi_warning_message("unable to allocate aligned OS memory (0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags); // fall through on error } // last resort return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } +static bool win_is_out_of_memory_error(DWORD err) { + switch (err) { + case ERROR_COMMITMENT_MINIMUM: + case ERROR_COMMITMENT_LIMIT: + case ERROR_PAGEFILE_QUOTA: + case ERROR_NOT_ENOUGH_MEMORY: + return true; + default: + return false; + } +} + +static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) { + long max_retry_msecs = mi_option_get_clamp(mi_option_retry_on_oom, 0, 2000); // at most 2 seconds + if (max_retry_msecs == 1) { max_retry_msecs = 100; } // if one sets the option to "true" + for (long tries = 1; tries <= 10; tries++) { // try at most 10 times (=2200ms) + void* p = win_virtual_alloc_prim_once(addr, size, try_alignment, flags); + if (p != NULL) { + // success, return the address + return p; + } + else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && + (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && + win_is_out_of_memory_error(GetLastError())) { + // if committing regular memory and being out-of-memory, + // keep trying for a bit in case memory frees up after all. See issue #894 + _mi_warning_message("out-of-memory on OS allocation, try again... 
(attempt %lu, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", tries, size, GetLastError(), addr, try_alignment, flags); + long sleep_msecs = tries*40; // increasing waits + if (sleep_msecs > max_retry_msecs) { sleep_msecs = max_retry_msecs; } + max_retry_msecs -= sleep_msecs; + Sleep(sleep_msecs); + } + else { + // otherwise return with an error + break; + } + } + return NULL; +} + static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { mi_assert_internal(!(large_only && !allow_large)); static _Atomic(size_t) large_page_try_ok; // = 0; @@ -482,7 +522,7 @@ void _mi_prim_out_stderr( const char* msg ) // on windows with redirection, the C runtime cannot handle locale dependent output // after the main thread closes so we use direct console output. if (!_mi_preloading()) { - // _cputs(msg); // _cputs cannot be used at is aborts if it fails to lock the console + // _cputs(msg); // _cputs cannot be used as it aborts when failing to lock the console static HANDLE hcon = INVALID_HANDLE_VALUE; static bool hconIsConsole; if (hcon == INVALID_HANDLE_VALUE) { @@ -572,6 +612,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { #if !defined(MI_SHARED_LIB) // use thread local storage keys to detect thread ending +// note: another design could be to use special linker sections (see issue #869) #include #if (_WIN32_WINNT < 0x600) // before Windows Vista WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); diff --git a/third-party/mimalloc/src/segment-map.c b/third-party/mimalloc/src/segment-map.c index 4c2104bd..1efb1e23 100644 --- a/third-party/mimalloc/src/segment-map.c +++ b/third-party/mimalloc/src/segment-map.c @@ -16,7 +16,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/internal.h" #include "mimalloc/atomic.h" -#if (MI_INTPTR_SIZE==8) +#if (MI_INTPTR_SIZE>=8) && MI_TRACK_ASAN +#define MI_MAX_ADDRESS ((size_t)140 << 40) // 140TB (see issue #881) +#elif (MI_INTPTR_SIZE >= 8) #define MI_MAX_ADDRESS ((size_t)40 << 40) // 40TB (to include huge page areas) #else #define MI_MAX_ADDRESS ((size_t)2 << 30) // 2Gb @@ -29,6 +31,7 @@ terms of the MIT license. A copy of the license can be found in the file static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { + // note: segment can be invalid or NULL. mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? if ((uintptr_t)segment >= MI_MAX_ADDRESS) { *bitidx = 0; @@ -70,8 +73,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment) { // Determine the segment belonging to a pointer or NULL if it is not in a valid segment. static mi_segment_t* _mi_segment_of(const void* p) { if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); - mi_assert_internal(segment != NULL); + mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL size_t bitidx; size_t index = mi_segment_map_index_of(segment, &bitidx); // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge diff --git a/third-party/mimalloc/src/segment.c b/third-party/mimalloc/src/segment.c index 28685f21..4e4dcb80 100644 --- a/third-party/mimalloc/src/segment.c +++ b/third-party/mimalloc/src/segment.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -11,13 +11,17 @@ terms of the MIT license. A copy of the license can be found in the file #include // memset #include -#define MI_PAGE_HUGE_ALIGN (256*1024) +// ------------------------------------------------------------------- +// Segments +// mimalloc pages reside in segments. See `mi_segment_valid` for invariants. +// ------------------------------------------------------------------- + static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats); // ------------------------------------------------------------------- -// commit mask +// commit mask // ------------------------------------------------------------------- static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) { @@ -146,10 +150,6 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) { /* -------------------------------------------------------------------------------- Segment allocation - - If a thread ends, it "abandons" pages with used blocks - and there is an abandoned segment list whose segments can - be reclaimed by still running threads, much like work-stealing. 
-------------------------------------------------------------------------------- */ @@ -212,7 +212,7 @@ static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) { sq->first = slice; if (slice->next != NULL) slice->next->prev = slice; else sq->last = slice; - slice->xblock_size = 0; // free + slice->block_size = 0; // free } static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) { @@ -223,7 +223,7 @@ static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* } static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) { - mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0); + mi_assert_internal(slice->block_size==0 && slice->slice_count>0 && slice->slice_offset==0); // should work too if the queue does not contain slice (which can happen during reclaim) if (slice->prev != NULL) slice->prev->next = slice->next; if (slice == sq->first) sq->first = slice->next; @@ -231,7 +231,7 @@ static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) { if (slice == sq->last) sq->last = slice->prev; slice->prev = NULL; slice->next = NULL; - slice->xblock_size = 1; // no more free + slice->block_size = 1; // no more free } @@ -240,7 +240,7 @@ static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) { ----------------------------------------------------------- */ static bool mi_slice_is_used(const mi_slice_t* slice) { - return (slice->xblock_size > 0); + return (slice->block_size > 0); } @@ -268,19 +268,20 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(slice->slice_offset == 0); size_t index = mi_slice_index(slice); size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? 
segment->slice_entries : index + slice->slice_count) - 1; - if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets + if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET_COUNT valid back offsets used_count++; - for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET && index + i <= maxindex; i++) { + mi_assert_internal(slice->is_huge == (segment->kind == MI_SEGMENT_HUGE)); + for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET_COUNT && index + i <= maxindex; i++) { mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t)); mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0); - mi_assert_internal(i==0 || segment->slices[index + i].xblock_size == 1); + mi_assert_internal(i==0 || segment->slices[index + i].block_size == 1); } // and the last entry as well (for coalescing) const mi_slice_t* last = slice + slice->slice_count - 1; if (last > slice && last < mi_segment_slices_end(segment)) { mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t)); mi_assert_internal(last->slice_count == 0); - mi_assert_internal(last->xblock_size == 1); + mi_assert_internal(last->block_size == 1); } } else { // free range of slices; only last slice needs a valid back offset @@ -289,7 +290,7 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset); } mi_assert_internal(slice == last || last->slice_count == 0 ); - mi_assert_internal(last->xblock_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->xblock_size==1)); + mi_assert_internal(last->block_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->block_size==1)); if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned sq = mi_span_queue_for(slice->slice_count,tld); mi_assert_internal(mi_span_queue_contains(sq,slice)); @@ -311,38 +312,46 @@ static size_t 
mi_segment_info_size(mi_segment_t* segment) { return segment->segment_info_slices * MI_SEGMENT_SLICE_SIZE; } -static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t xblock_size, size_t* page_size) +static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t block_size, size_t* page_size) { - ptrdiff_t idx = slice - segment->slices; - size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE; + const ptrdiff_t idx = slice - segment->slices; + const size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE; + uint8_t* const pstart = (uint8_t*)segment + (idx*MI_SEGMENT_SLICE_SIZE); // make the start not OS page aligned for smaller blocks to avoid page/cache effects - // note: the offset must always be an xblock_size multiple since we assume small allocations + // note: the offset must always be a block_size multiple since we assume small allocations // are aligned (see `mi_heap_malloc_aligned`). 
size_t start_offset = 0; - if (xblock_size >= MI_INTPTR_SIZE) { - if (xblock_size <= 64) { start_offset = 3*xblock_size; } - else if (xblock_size <= 512) { start_offset = xblock_size; } + if (block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) { + // for small objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) + const size_t adjust = block_size - ((uintptr_t)pstart % block_size); + if (adjust < block_size && psize >= block_size + adjust) { + start_offset += adjust; + } + } + if (block_size >= MI_INTPTR_SIZE) { + if (block_size <= 64) { start_offset += 3*block_size; } + else if (block_size <= 512) { start_offset += block_size; } } if (page_size != NULL) { *page_size = psize - start_offset; } - return (uint8_t*)segment + ((idx*MI_SEGMENT_SLICE_SIZE) + start_offset); + return (pstart + start_offset); } // Start of the page available memory; can be used on uninitialized pages uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page); - uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, page->xblock_size, page_size); - mi_assert_internal(page->xblock_size > 0 || _mi_ptr_page(p) == page); + uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, mi_page_block_size(page), page_size); + mi_assert_internal(mi_page_block_size(page) > 0 || _mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } -static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, size_t* info_slices) { +static size_t mi_segment_calculate_slices(size_t required, size_t* info_slices) { size_t page_size = _mi_os_page_size(); size_t isize = _mi_align_up(sizeof(mi_segment_t), page_size); size_t guardsize = 0; - + if (MI_SECURE>0) { // in secure mode, we set up a protected page in between the segment info // and the page data (and one at the end of the segment) @@ -352,10 +361,9 @@ static 
size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, siz } } - if (pre_size != NULL) *pre_size = isize; isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE); if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE; - size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) ); + size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) ); mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0); return (segment_size / MI_SEGMENT_SLICE_SIZE); } @@ -380,6 +388,10 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) { segment->thread_id = 0; _mi_segment_map_freed_at(segment); mi_segments_track_size(-((long)mi_segment_size(segment)),tld); + if (segment->was_reclaimed) { + tld->reclaim_count--; + segment->was_reclaimed = false; + } if (MI_SECURE>0) { // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted @@ -391,7 +403,7 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) { // purge delayed decommits now? 
(no, leave it to the arena) // mi_segment_try_purge(segment,true,tld->stats); - + const size_t size = mi_segment_size(segment); const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size); @@ -399,13 +411,6 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) { _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid, tld->stats); } -// called by threads that are terminating -void _mi_segment_thread_collect(mi_segments_tld_t* tld) { - MI_UNUSED(tld); - // nothing to do -} - - /* ----------------------------------------------------------- Commit/Decommit ranges ----------------------------------------------------------- */ @@ -451,7 +456,7 @@ static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uin size_t bitidx = start / MI_COMMIT_SIZE; mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS); - + size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0 if (bitidx + bitcount > MI_COMMIT_MASK_BITS) { _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size); @@ -479,7 +484,7 @@ static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size, mi if (!_mi_os_commit(start, full_size, &is_zero, stats)) return false; mi_commit_mask_set(&segment->commit_mask, &mask); } - + // increase purge expiration when using part of delayed purges -- we assume more allocations are coming soon. 
if (mi_commit_mask_any_set(&segment->purge_mask, &mask)) { segment->purge_expire = _mi_clock_now() + mi_option_get(mi_option_purge_delay); @@ -498,7 +503,7 @@ static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_ return mi_segment_commit(segment, p, size, stats); } -static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { +static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); if (!segment->allow_purge) return true; @@ -517,11 +522,11 @@ static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_ if (decommitted) { mi_commit_mask_t cmask; mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); - _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting + _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting mi_commit_mask_clear(&segment->commit_mask, &mask); - } + } } - + // always clear any scheduled purges in our range mi_commit_mask_clear(&segment->purge_mask, &mask); return true; @@ -537,16 +542,16 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t // register for future purge in the purge mask uint8_t* start = NULL; size_t full_size = 0; - mi_commit_mask_t mask; + mi_commit_mask_t mask; mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask); if (mi_commit_mask_is_empty(&mask) || full_size==0) return; - + // update delayed commit - mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask)); + mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask)); mi_commit_mask_t cmask; 
mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); // only purge what is committed; span_free may try to decommit more mi_commit_mask_set(&segment->purge_mask, &cmask); - mi_msecs_t now = _mi_clock_now(); + mi_msecs_t now = _mi_clock_now(); if (segment->purge_expire == 0) { // no previous purgess, initialize now segment->purge_expire = now + mi_option_get(mi_option_purge_delay); @@ -564,11 +569,11 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t // previous purge mask is not yet expired, increase the expiration by a bit. segment->purge_expire += mi_option_get(mi_option_purge_extend_delay); } - } + } } static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats) { - if (!segment->allow_purge || mi_commit_mask_is_empty(&segment->purge_mask)) return; + if (!segment->allow_purge || segment->purge_expire == 0 || mi_commit_mask_is_empty(&segment->purge_mask)) return; mi_msecs_t now = _mi_clock_now(); if (!force && now < segment->purge_expire) return; @@ -590,19 +595,24 @@ static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); } +// called from `mi_heap_collect_ex` +// this can be called per-page so it is important that try_purge has fast exit path +void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { + mi_segment_try_purge(segment, force, tld->stats); +} /* ----------------------------------------------------------- Span free ----------------------------------------------------------- */ static bool mi_segment_is_abandoned(mi_segment_t* segment) { - return (segment->thread_id == 0); + return (mi_atomic_load_relaxed(&segment->thread_id) == 0); } // note: can be called on abandoned segments static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_purge, mi_segments_tld_t* tld) { mi_assert_internal(slice_index < 
segment->slice_entries); - mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) + mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) ? NULL : mi_span_queue_for(slice_count,tld)); if (slice_count==0) slice_count = 1; mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries); @@ -613,20 +623,22 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size mi_assert_internal(slice->slice_count == slice_count); // no overflow? slice->slice_offset = 0; if (slice_count > 1) { - mi_slice_t* last = &segment->slices[slice_index + slice_count - 1]; + mi_slice_t* last = slice + slice_count - 1; + mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment); + if (last > end) { last = end; } last->slice_count = 0; last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1)); - last->xblock_size = 0; + last->block_size = 0; } // perhaps decommit if (allow_purge) { mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats); } - + // and push it on the free page queue (if it was not a huge page) if (sq != NULL) mi_span_queue_push( sq, slice ); - else slice->xblock_size = 0; // mark huge page as free anyways + else slice->block_size = 0; // mark huge page as free anyways } /* @@ -640,7 +652,7 @@ static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) */ static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) { - mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->xblock_size==0); + mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->block_size==0); mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE); mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld); mi_span_queue_delete(sq, slice); @@ -649,15 +661,15 @@ static void mi_segment_span_remove_from_queue(mi_slice_t* slice, 
mi_segments_tld // note: can be called on abandoned segments static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) { mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0); - mi_segment_t* segment = _mi_ptr_segment(slice); - bool is_abandoned = mi_segment_is_abandoned(segment); + mi_segment_t* const segment = _mi_ptr_segment(slice); + const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment); // for huge pages, just mark as free but don't add to the queues if (segment->kind == MI_SEGMENT_HUGE) { // issue #691: segment->used can be 0 if the huge page block was freed while abandoned (reclaim will get here in that case) - mi_assert_internal((segment->used==0 && slice->xblock_size==0) || segment->used == 1); // decreased right after this call in `mi_segment_page_clear` - slice->xblock_size = 0; // mark as free anyways - // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to + mi_assert_internal((segment->used==0 && slice->block_size==0) || segment->used == 1); // decreased right after this call in `mi_segment_page_clear` + slice->block_size = 0; // mark as free anyways + // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to // avoid a possible cache miss (and the segment is about to be freed) return slice; } @@ -666,7 +678,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ size_t slice_count = slice->slice_count; mi_slice_t* next = slice + slice->slice_count; mi_assert_internal(next <= mi_segment_slices_end(segment)); - if (next < mi_segment_slices_end(segment) && next->xblock_size==0) { + if (next < mi_segment_slices_end(segment) && next->block_size==0) { // free next block -- remove it from free and merge mi_assert_internal(next->slice_count > 0 && next->slice_offset==0); slice_count += next->slice_count; // extend @@ -675,7 +687,7 @@ static mi_slice_t* 
mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ if (slice > segment->slices) { mi_slice_t* prev = mi_slice_first(slice - 1); mi_assert_internal(prev >= segment->slices); - if (prev->xblock_size==0) { + if (prev->block_size==0) { // free previous slice -- remove it from free and merge mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0); slice_count += prev->slice_count; @@ -699,7 +711,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) { mi_assert_internal(slice_index < segment->slice_entries); mi_slice_t* const slice = &segment->slices[slice_index]; - mi_assert_internal(slice->xblock_size==0 || slice->xblock_size==1); + mi_assert_internal(slice->block_size==0 || slice->block_size==1); // commit before changing the slice data if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats)) { @@ -711,20 +723,20 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i slice->slice_count = (uint32_t)slice_count; mi_assert_internal(slice->slice_count == slice_count); const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE; - slice->xblock_size = (uint32_t)(bsize >= MI_HUGE_BLOCK_SIZE ? 
MI_HUGE_BLOCK_SIZE : bsize); + slice->block_size = bsize; mi_page_t* page = mi_slice_to_page(slice); mi_assert_internal(mi_page_block_size(page) == bsize); - // set slice back pointers for the first MI_MAX_SLICE_OFFSET entries + // set slice back pointers for the first MI_MAX_SLICE_OFFSET_COUNT entries size_t extra = slice_count-1; - if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET; + if (extra > MI_MAX_SLICE_OFFSET_COUNT) extra = MI_MAX_SLICE_OFFSET_COUNT; if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1; // huge objects may have more slices than avaiable entries in the segment->slices - + mi_slice_t* slice_next = slice + 1; for (size_t i = 1; i <= extra; i++, slice_next++) { slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i); slice_next->slice_count = 0; - slice_next->xblock_size = 1; + slice_next->block_size = 1; } // and also for the last one (if not set already) (the last one is needed for coalescing and for large alignments) @@ -735,11 +747,12 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i if (last > slice) { last->slice_offset = (uint32_t)(sizeof(mi_slice_t) * (last - slice)); last->slice_count = 0; - last->xblock_size = 1; + last->block_size = 1; } - + // and initialize the page page->is_committed = true; + page->is_huge = (segment->kind == MI_SEGMENT_HUGE); segment->used++; return page; } @@ -747,7 +760,7 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) { mi_assert_internal(_mi_ptr_segment(slice) == segment); mi_assert_internal(slice->slice_count >= slice_count); - mi_assert_internal(slice->xblock_size > 0); // no more in free queue + mi_assert_internal(slice->block_size > 0); // no more in free queue if (slice->slice_count <= slice_count) return; mi_assert_internal(segment->kind != 
MI_SEGMENT_HUGE); size_t next_index = mi_slice_index(slice) + slice_count; @@ -773,7 +786,7 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren if (slice->slice_count > slice_count) { mi_segment_slice_split(segment, slice, slice_count, tld); } - mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0); + mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->block_size > 0); mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld); if (page == NULL) { // commit failed; return NULL but first restore the slice @@ -796,7 +809,7 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren ----------------------------------------------------------- */ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id, - size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices, + size_t* psegment_slices, size_t* pinfo_slices, bool commit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { @@ -804,7 +817,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment bool allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy size_t align_offset = 0; size_t alignment = MI_SEGMENT_ALIGN; - + if (page_alignment > 0) { // mi_assert_internal(huge_page != NULL); mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN); @@ -813,7 +826,8 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN ); const size_t extra = align_offset - info_size; // recalculate due to potential guard pages - *psegment_slices = mi_segment_calculate_slices(required + extra, ppre_size, pinfo_slices); + *psegment_slices = mi_segment_calculate_slices(required + extra, pinfo_slices); + mi_assert_internal(*psegment_slices > 0 
&& *psegment_slices <= UINT32_MAX); } const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE; @@ -822,21 +836,21 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment return NULL; // failed to allocate } - // ensure metadata part of the segment is committed - mi_commit_mask_t commit_mask; - if (memid.initially_committed) { - mi_commit_mask_create_full(&commit_mask); + // ensure metadata part of the segment is committed + mi_commit_mask_t commit_mask; + if (memid.initially_committed) { + mi_commit_mask_create_full(&commit_mask); } - else { + else { // at least commit the info slices const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE); mi_assert_internal(commit_needed>0); - mi_commit_mask_create(0, commit_needed, &commit_mask); + mi_commit_mask_create(0, commit_needed, &commit_mask); mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE); if (!_mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, NULL, tld->stats)) { _mi_arena_free(segment,segment_size,0,memid,tld->stats); return NULL; - } + } } mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); @@ -847,8 +861,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment segment->commit_mask = commit_mask; segment->purge_expire = 0; mi_commit_mask_create_empty(&segment->purge_mask); - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan - + mi_segments_track_size((long)(segment_size), tld); _mi_segment_map_allocated_at(segment); return segment; @@ -859,32 +872,32 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) { mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL)); - 
+ // calculate needed sizes first size_t info_slices; - size_t pre_size; - size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices); - + size_t segment_slices = mi_segment_calculate_slices(required, &info_slices); + mi_assert_internal(segment_slices > 0 && segment_slices <= UINT32_MAX); + // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little) const bool eager_delay = (// !_mi_os_has_overcommit() && // never delay on overcommit systems _mi_current_thread_count() > 1 && // do not delay for the first N threads tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit); - bool commit = eager || (required > 0); - - // Allocate the segment from the OS - mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, - &segment_slices, &pre_size, &info_slices, commit, tld, os_tld); + bool commit = eager || (required > 0); + + // Allocate the segment from the OS + mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, + &segment_slices, &info_slices, commit, tld, os_tld); if (segment == NULL) return NULL; - - // zero the segment info? -- not always needed as it may be zero initialized from the OS + + // zero the segment info? -- not always needed as it may be zero initialized from the OS if (!segment->memid.initially_zero) { ptrdiff_t ofs = offsetof(mi_segment_t, next); size_t prefix = offsetof(mi_segment_t, slices) - ofs; - size_t zsize = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more + size_t zsize = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more _mi_memzero((uint8_t*)segment + ofs, zsize); } - + // initialize the rest of the segment info const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? 
MI_SLICES_PER_SEGMENT : segment_slices); segment->segment_slices = segment_slices; @@ -903,7 +916,6 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi // in secure mode, we set up a protected page in between the segment info // and the page data, and at the end of the segment. size_t os_pagesize = _mi_os_page_size(); - mi_assert_internal(mi_segment_info_size(segment) - os_pagesize >= pre_size); _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize); uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize; mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats); @@ -914,10 +926,10 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi // reserve first slices for segment info mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld); - mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance + mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance mi_assert_internal(segment->used == 1); segment->used = 0; // don't count our internal slices towards usage - + // initialize initial free pages if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page mi_assert_internal(huge_page==NULL); @@ -928,7 +940,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask)); *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld); - mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance + mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance } mi_assert_expensive(mi_segment_is_valid(segment,tld)); @@ -951,8 +963,8 @@ static void mi_segment_free(mi_segment_t* segment, bool force, 
mi_segments_tld_t while (slice < end) { mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); - mi_assert_internal(mi_slice_index(slice)==0 || slice->xblock_size == 0); // no more used pages .. - if (slice->xblock_size == 0 && segment->kind != MI_SEGMENT_HUGE) { + mi_assert_internal(mi_slice_index(slice)==0 || slice->block_size == 0); // no more used pages .. + if (slice->block_size == 0 && segment->kind != MI_SEGMENT_HUGE) { mi_segment_span_remove_from_queue(slice, tld); } #if MI_DEBUG>1 @@ -978,11 +990,11 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); // note: can be called on abandoned pages static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(page->xblock_size > 0); + mi_assert_internal(page->block_size > 0); mi_assert_internal(mi_page_all_free(page)); mi_segment_t* segment = _mi_ptr_segment(page); mi_assert_internal(segment->used > 0); - + size_t inuse = page->capacity * mi_page_block_size(page); _mi_stat_decrease(&tld->stats->page_committed, inuse); _mi_stat_decrease(&tld->stats->pages, 1); @@ -990,18 +1002,20 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld // reset the page memory to reduce memory pressure? 
if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) { size_t psize; - uint8_t* start = _mi_page_start(segment, page, &psize); + uint8_t* start = _mi_segment_page_start(segment, page, &psize); _mi_os_reset(start, psize, tld->stats); } - // zero the page data, but not the segment fields + // zero the page data, but not the segment fields and heap tag page->is_zero_init = false; + uint8_t heap_tag = page->heap_tag; ptrdiff_t ofs = offsetof(mi_page_t, capacity); _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); - page->xblock_size = 1; + page->block_size = 1; + page->heap_tag = heap_tag; // and free it - mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld); + mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld); segment->used--; // cannot assert segment valid as it is called during reclaim // mi_assert_expensive(mi_segment_is_valid(segment, tld)); @@ -1027,6 +1041,10 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) // only abandoned pages; remove from free list and abandon mi_segment_abandon(segment,tld); } + else { + // perform delayed purges + mi_segment_try_purge(segment, false /* force? */, tld->stats); + } } @@ -1036,172 +1054,21 @@ Abandonment When threads terminate, they can leave segments with live blocks (reachable through other threads). Such segments are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually - -We maintain a global list of abandoned segments that are -reclaimed on demand. Since this is shared among threads -the implementation needs to avoid the A-B-A problem on -popping abandoned segments: -We use tagged pointers to avoid accidentally identifying -reused segments, much like stamped references in Java. -Secondly, we maintain a reader counter to avoid resetting -or decommitting segments that have a pending read operation. 
- -Note: the current implementation is one possible design; -another way might be to keep track of abandoned segments -in the arenas/segment_cache's. This would have the advantage of keeping -all concurrent code in one place and not needing to deal -with ABA issues. The drawback is that it is unclear how to -scan abandoned segments efficiently in that case as they -would be spread among all other segments in the arenas. ------------------------------------------------------------ */ - -// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers -// to put in a tag that increments on update to avoid the A-B-A problem. -#define MI_TAGGED_MASK MI_SEGMENT_MASK -typedef uintptr_t mi_tagged_segment_t; - -static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) { - return (mi_segment_t*)(ts & ~MI_TAGGED_MASK); -} - -static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_segment_t ts) { - mi_assert_internal(((uintptr_t)segment & MI_TAGGED_MASK) == 0); - uintptr_t tag = ((ts & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK; - return ((uintptr_t)segment | tag); -} - -// This is a list of visited abandoned pages that were full at the time. -// this list migrates to `abandoned` when that becomes NULL. The use of -// this list reduces contention and the rate at which segments are visited. -static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL - -// The abandoned page list (tagged as it supports pop) -static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL - -// Maintain these for debug purposes (these counts may be a bit off) -static mi_decl_cache_align _Atomic(size_t) abandoned_count; -static mi_decl_cache_align _Atomic(size_t) abandoned_visited_count; - -// We also maintain a count of current readers of the abandoned list -// in order to prevent resetting/decommitting segment memory if it might -// still be read. 
-static mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0 - -// Push on the visited list -static void mi_abandoned_visited_push(mi_segment_t* segment) { - mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL); - mi_assert_internal(segment->next == NULL); - mi_assert_internal(segment->used > 0); - mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited); - do { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext); - } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment)); - mi_atomic_increment_relaxed(&abandoned_visited_count); -} +reuse their pages and/or free them eventually. The +`thread_id` of such segments is 0. -// Move the visited list to the abandoned list. -static bool mi_abandoned_visited_revisit(void) -{ - // quick check if the visited list is empty - if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false; - - // grab the whole visited list - mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL); - if (first == NULL) return false; - - // first try to swap directly if the abandoned list happens to be NULL - mi_tagged_segment_t afirst; - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - if (mi_tagged_segment_ptr(ts)==NULL) { - size_t count = mi_atomic_load_relaxed(&abandoned_visited_count); - afirst = mi_tagged_segment(first, ts); - if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) { - mi_atomic_add_relaxed(&abandoned_count, count); - mi_atomic_sub_relaxed(&abandoned_visited_count, count); - return true; - } - } +When a block is freed in an abandoned segment, the segment +is reclaimed into that thread. 
- // find the last element of the visited list: O(n) - mi_segment_t* last = first; - mi_segment_t* next; - while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) { - last = next; - } - - // and atomically prepend to the abandoned list - // (no need to increase the readers as we don't access the abandoned segments) - mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned); - size_t count; - do { - count = mi_atomic_load_relaxed(&abandoned_visited_count); - mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext)); - afirst = mi_tagged_segment(first, anext); - } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst)); - mi_atomic_add_relaxed(&abandoned_count, count); - mi_atomic_sub_relaxed(&abandoned_visited_count, count); - return true; -} - -// Push on the abandoned list. -static void mi_abandoned_push(mi_segment_t* segment) { - mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); - mi_assert_internal(segment->next == NULL); - mi_assert_internal(segment->used > 0); - mi_tagged_segment_t next; - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - do { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts)); - next = mi_tagged_segment(segment, ts); - } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next)); - mi_atomic_increment_relaxed(&abandoned_count); -} +Moreover, if threads are looking for a fresh segment, they +will first consider abondoned segments -- these can be found +by scanning the arena memory +(segments outside arena memoryare only reclaimed by a free). 
+----------------------------------------------------------- */ -// Wait until there are no more pending reads on segments that used to be in the abandoned list -// called for example from `arena.c` before decommitting +// legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list void _mi_abandoned_await_readers(void) { - size_t n; - do { - n = mi_atomic_load_acquire(&abandoned_readers); - if (n != 0) mi_atomic_yield(); - } while (n != 0); -} - -// Pop from the abandoned list -static mi_segment_t* mi_abandoned_pop(void) { - mi_segment_t* segment; - // Check efficiently if it is empty (or if the visited list needs to be moved) - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - segment = mi_tagged_segment_ptr(ts); - if mi_likely(segment == NULL) { - if mi_likely(!mi_abandoned_visited_revisit()) { // try to swap in the visited list on NULL - return NULL; - } - } - - // Do a pop. We use a reader count to prevent - // a segment to be decommitted while a read is still pending, - // and a tagged pointer to prevent A-B-A link corruption. 
- // (this is called from `region.c:_mi_mem_free` for example) - mi_atomic_increment_relaxed(&abandoned_readers); // ensure no segment gets decommitted - mi_tagged_segment_t next = 0; - ts = mi_atomic_load_acquire(&abandoned); - do { - segment = mi_tagged_segment_ptr(ts); - if (segment != NULL) { - mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next); - next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted - } - } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next)); - mi_atomic_decrement_relaxed(&abandoned_readers); // release reader lock - if (segment != NULL) { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); - mi_atomic_decrement_relaxed(&abandoned_count); - } - return segment; + // nothing needed } /* ----------------------------------------------------------- @@ -1211,33 +1078,38 @@ static mi_segment_t* mi_abandoned_pop(void) { static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); mi_assert_internal(segment->used > 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); mi_assert_internal(segment->abandoned_visits == 0); mi_assert_expensive(mi_segment_is_valid(segment,tld)); - + // remove the free pages from the free page queues mi_slice_t* slice = &segment->slices[0]; const mi_slice_t* end = mi_segment_slices_end(segment); while (slice < end) { mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); - if (slice->xblock_size == 0) { // a free page + if (slice->block_size == 0) { // a free page mi_segment_span_remove_from_queue(slice,tld); - slice->xblock_size = 0; // but keep it free + slice->block_size = 0; // but keep it free } slice = slice + slice->slice_count; } // perform delayed decommits (forcing is much slower on mstress) - 
mi_segment_try_purge(segment, mi_option_is_enabled(mi_option_abandoned_page_purge) /* force? */, tld->stats); - + // Only abandoned segments in arena memory can be reclaimed without a free + // so if a segment is not from an arena we force purge here to be conservative. + const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); + mi_segment_try_purge(segment, force_purge, tld->stats); + // all pages in the segment are abandoned; add it to the abandoned list _mi_stat_increase(&tld->stats->segments_abandoned, 1); mi_segments_track_size(-((long)mi_segment_size(segment)), tld); segment->thread_id = 0; - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); segment->abandoned_visits = 1; // from 0 to 1 to signify it is abandoned - mi_abandoned_push(segment); + if (segment->was_reclaimed) { + tld->reclaim_count--; + segment->was_reclaimed = false; + } + _mi_arena_segment_mark_abandoned(segment); } void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { @@ -1247,7 +1119,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { mi_segment_t* segment = _mi_page_segment(page); mi_assert_expensive(mi_segment_is_valid(segment,tld)); - segment->abandoned++; + segment->abandoned++; _mi_stat_increase(&tld->stats->pages_abandoned, 1); mi_assert_internal(segment->abandoned <= segment->used); @@ -1264,18 +1136,17 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) { mi_slice_t* slice = &segment->slices[0]; *end = mi_segment_slices_end(segment); - mi_assert_internal(slice->slice_count>0 && slice->xblock_size>0); // segment allocated page + mi_assert_internal(slice->slice_count>0 && slice->block_size>0); // segment allocated page slice = slice + slice->slice_count; // skip the first segment allocated page return slice; } // Possibly free pages and 
check if free space is available -static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) +static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) { - mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); mi_assert_internal(mi_segment_is_abandoned(segment)); bool has_page = false; - + // for all slices const mi_slice_t* end; mi_slice_t* slice = mi_slices_start_iterate(segment, &end); @@ -1287,7 +1158,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, s mi_page_t* const page = mi_slice_to_page(slice); _mi_page_free_collect(page, false); if (mi_page_all_free(page)) { - // if this page is all free now, free it without adding to any queues (yet) + // if this page is all free now, free it without adding to any queues (yet) mi_assert_internal(page->next == NULL && page->prev==NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); segment->abandoned--; @@ -1297,12 +1168,10 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, s has_page = true; } } - else { - if (page->xblock_size == block_size && mi_page_has_any_available(page)) { - // a page has available free blocks of the right size - has_page = true; - } - } + else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { + // a page has available free blocks of the right size + has_page = true; + } } else { // empty span @@ -1318,16 +1187,17 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, s // Reclaim an abandoned segment; returns NULL if the segment was freed // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. 
static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } - - segment->thread_id = _mi_thread_id(); + // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; + segment->was_reclaimed = true; + tld->reclaim_count++; mi_segments_track_size((long)mi_segment_size(segment), tld); mi_assert_internal(segment->next == NULL); _mi_stat_decrease(&tld->stats->segments_abandoned, 1); - + // for all slices const mi_slice_t* end; mi_slice_t* slice = mi_slices_start_iterate(segment, &end); @@ -1343,8 +1213,13 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_assert_internal(page->next == NULL && page->prev==NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); segment->abandoned--; - // set the heap again and allow delayed free again - mi_page_set_heap(page, heap); + // set the heap again and allow heap thread delayed free again. 
+ mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects + if (target_heap == NULL) { + target_heap = heap; + _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using %u instead)\n", page->heap_tag, heap->tag ); + } + mi_page_set_heap(page, target_heap); _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date if (mi_page_all_free(page)) { @@ -1353,8 +1228,8 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, } else { // otherwise reclaim it into the heap - _mi_page_reclaim(heap, page); - if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) { + _mi_page_reclaim(target_heap, page); + if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page) && heap == target_heap) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } } } @@ -1368,6 +1243,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, } mi_assert(segment->abandoned == 0); + mi_assert_expensive(mi_segment_is_valid(segment, tld)); if (segment->used == 0) { // due to page_clear mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed)); mi_segment_free(segment, false, tld); @@ -1378,23 +1254,54 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, } } +// attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) +bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { + if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned + // don't reclaim more from a free than half the current segments + // this is to prevent a pure free-ing thread to start owning too many segments + if (heap->tld->segments.reclaim_count * 2 > 
heap->tld->segments.count) return false; + if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon + mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); + mi_assert_internal(res == segment); + return (res != NULL); + } + return false; +} void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; - while ((segment = mi_abandoned_pop()) != NULL) { + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } } +static long mi_segment_get_reclaim_tries(void) { + // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. + const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); + if (perc <= 0) return 0; + const size_t total_count = _mi_arena_segment_abandoned_count(); + if (total_count == 0) return 0; + const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow + long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 
1024 : relative_count)); + if (max_tries < 8 && total_count > 8) { max_tries = 8; } + return max_tries; +} + static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; + long max_tries = mi_segment_get_reclaim_tries(); + if (max_tries <= 0) return NULL; + mi_segment_t* segment; - long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024); // limit the work to bound allocation times - while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) { + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) + { segment->abandoned_visits++; - // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments - // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way? + // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? + // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries + // Perhaps we can skip non-suitable ones in a better way? 
bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees) if (segment->used == 0) { @@ -1406,19 +1313,19 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice mi_segment_reclaim(segment, heap, 0, NULL, tld); } else if (has_page && is_suitable) { - // found a large enough free span, or a page of the right block_size with free space + // found a large enough free span, or a page of the right block_size with free space // we return the result of reclaim (which is usually `segment`) as it might free // the segment due to concurrent frees (in which case `NULL` is returned). return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); } - else if (segment->abandoned_visits > 3 && is_suitable) { + else if (segment->abandoned_visits > 3 && is_suitable) { // always reclaim on 3rd visit to limit the abandoned queue length. mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { // otherwise, push on the visited list so it gets not looked at too quickly again - mi_segment_try_purge(segment, true /* force? */, tld->stats); // force purge if needed as we may not visit soon again - mi_abandoned_visited_push(segment); + mi_segment_try_purge(segment, false /* true force? */, tld->stats); // force purge if needed as we may not visit soon again + _mi_arena_segment_mark_abandoned(segment); } } return NULL; @@ -1428,11 +1335,9 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) { mi_segment_t* segment; - int max_tries = (force ? 16*1024 : 1024); // limit latency - if (force) { - mi_abandoned_visited_revisit(); - } - while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) { + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + long max_tries = (force ? 
(long)_mi_arena_segment_abandoned_count() : 1024); // limit latency + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees) if (segment->used == 0) { // free the segment (by forced reclaim) to make it available to other threads. @@ -1441,10 +1346,10 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { - // otherwise, purge if needed and push on the visited list + // otherwise, purge if needed and push on the visited list // note: forced purge can be expensive if many threads are destroyed/created as in mstress. mi_segment_try_purge(segment, force, tld->stats); - mi_abandoned_visited_push(segment); + _mi_arena_segment_mark_abandoned(segment); } } } @@ -1455,9 +1360,8 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); - + // 1. try to reclaim an abandoned segment bool reclaimed; mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld); @@ -1471,7 +1375,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_ return segment; } // 2. 
otherwise allocate a fresh segment - return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL); + return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL); } @@ -1492,7 +1396,7 @@ static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_ki // no free page, allocate a new segment and try again if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) { // OOM or reclaimed a good page in the heap - return NULL; + return NULL; } else { // otherwise try again @@ -1517,27 +1421,28 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,os_tld,&page); if (segment == NULL || page==NULL) return NULL; mi_assert_internal(segment->used==1); - mi_assert_internal(mi_page_block_size(page) >= size); + mi_assert_internal(mi_page_block_size(page) >= size); #if MI_HUGE_PAGE_ABANDON segment->thread_id = 0; // huge segments are immediately abandoned - #endif + #endif - // for huge pages we initialize the xblock_size as we may + // for huge pages we initialize the block_size as we may // overallocate to accommodate large alignments. size_t psize; uint8_t* start = _mi_segment_page_start(segment, page, &psize); - page->xblock_size = (psize > MI_HUGE_BLOCK_SIZE ? 
MI_HUGE_BLOCK_SIZE : (uint32_t)psize); - + page->block_size = psize; + mi_assert_internal(page->is_huge); + // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) if (page_alignment > 0 && segment->allow_decommit) { uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment)); - mi_assert_internal(psize - (aligned_p - start) >= size); + mi_assert_internal(psize - (aligned_p - start) >= size); uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list ptrdiff_t decommit_size = aligned_p - decommit_start; - _mi_os_reset(decommit_start, decommit_size, &_mi_stats_main); // note: cannot use segment_decommit on huge segments + _mi_os_reset(decommit_start, decommit_size, &_mi_stats_main); // note: cannot use segment_decommit on huge segments } - + return page; } @@ -1557,7 +1462,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block mi_block_set_next(page, block, page->free); page->free = block; page->used--; - page->is_zero = false; + page->is_zero_init = false; mi_assert(page->used == 0); mi_tld_t* tld = heap->tld; _mi_segment_page_free(page, true, &tld->segments); @@ -1593,7 +1498,7 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc ----------------------------------------------------------- */ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if mi_unlikely(page_alignment > MI_ALIGNMENT_MAX) { + if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { mi_assert_internal(_mi_is_power_of_two(page_alignment)); mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } @@ -1609,7 +1514,7 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t pag 
page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld); } else { - page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); + page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); } mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid)); mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); diff --git a/third-party/mimalloc/src/static.c b/third-party/mimalloc/src/static.c index bc05dd72..bf025eb7 100644 --- a/third-party/mimalloc/src/static.c +++ b/third-party/mimalloc/src/static.c @@ -27,6 +27,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "bitmap.c" #include "heap.c" #include "init.c" +#include "libc.c" #include "options.c" #include "os.c" #include "page.c" // includes page-queue.c diff --git a/third-party/mimalloc/src/stats.c b/third-party/mimalloc/src/stats.c index 300956ce..a9364027 100644 --- a/third-party/mimalloc/src/stats.c +++ b/third-party/mimalloc/src/stats.c @@ -9,7 +9,6 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" -#include // snprintf #include // memset #if defined(_MSC_VER) && (_MSC_VER < 1920) @@ -146,7 +145,7 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* const int64_t pos = (n < 0 ? -n : n); if (pos < base) { if (n!=1 || suffix[0] != 'B') { // skip printing 1 B for the unit column - snprintf(buf, len, "%d %-3s", (int)n, (n==0 ? "" : suffix)); + _mi_snprintf(buf, len, "%lld %-3s", (long long)n, (n==0 ? "" : suffix)); } } else { @@ -158,8 +157,8 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* const long whole = (long)(tens/10); const long frac1 = (long)(tens%10); char unitdesc[8]; - snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? 
"i" : ""), suffix); - snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc); + _mi_snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix); + _mi_snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc); } _mi_fprintf(out, arg, (fmt==NULL ? "%12s" : fmt), buf); } @@ -176,13 +175,28 @@ static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* ar static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) { _mi_fprintf(out, arg,"%10s:", msg); - if (unit > 0) { - mi_print_amount(stat->peak, unit, out, arg); - mi_print_amount(stat->allocated, unit, out, arg); - mi_print_amount(stat->freed, unit, out, arg); - mi_print_amount(stat->current, unit, out, arg); - mi_print_amount(unit, 1, out, arg); - mi_print_count(stat->allocated, unit, out, arg); + if (unit != 0) { + if (unit > 0) { + mi_print_amount(stat->peak, unit, out, arg); + mi_print_amount(stat->allocated, unit, out, arg); + mi_print_amount(stat->freed, unit, out, arg); + mi_print_amount(stat->current, unit, out, arg); + mi_print_amount(unit, 1, out, arg); + mi_print_count(stat->allocated, unit, out, arg); + } + else { + mi_print_amount(stat->peak, -1, out, arg); + mi_print_amount(stat->allocated, -1, out, arg); + mi_print_amount(stat->freed, -1, out, arg); + mi_print_amount(stat->current, -1, out, arg); + if (unit == -1) { + _mi_fprintf(out, arg, "%24s", ""); + } + else { + mi_print_amount(-unit, 1, out, arg); + mi_print_count((stat->allocated / -unit), 0, out, arg); + } + } if (stat->allocated > stat->freed) { _mi_fprintf(out, arg, " "); _mi_fprintf(out, arg, (notok == NULL ? 
"not all freed" : notok)); @@ -192,23 +206,6 @@ static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64 _mi_fprintf(out, arg, " ok\n"); } } - else if (unit<0) { - mi_print_amount(stat->peak, -1, out, arg); - mi_print_amount(stat->allocated, -1, out, arg); - mi_print_amount(stat->freed, -1, out, arg); - mi_print_amount(stat->current, -1, out, arg); - if (unit==-1) { - _mi_fprintf(out, arg, "%24s", ""); - } - else { - mi_print_amount(-unit, 1, out, arg); - mi_print_count((stat->allocated / -unit), 0, out, arg); - } - if (stat->allocated > stat->freed) - _mi_fprintf(out, arg, " not all freed!\n"); - else - _mi_fprintf(out, arg, " ok\n"); - } else { mi_print_amount(stat->peak, 1, out, arg); mi_print_amount(stat->allocated, 1, out, arg); @@ -255,7 +252,7 @@ static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const c if (bins[i].allocated > 0) { found = true; int64_t unit = _mi_bin_size((uint8_t)i); - snprintf(buf, 64, "%s %3lu", fmt, (long)i); + _mi_snprintf(buf, 64, "%s %3lu", fmt, (long)i); mi_stat_print(&bins[i], buf, unit, out, arg); } } @@ -341,6 +338,9 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); + mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); + mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); + mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); @@ -455,7 +455,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s pinfo.page_faults = 0; _mi_prim_process_info(&pinfo); - + if 
(elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX)); if (user_msecs!=NULL) *user_msecs = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX)); if (system_msecs!=NULL) *system_msecs = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX)); diff --git a/third-party/mimalloc/test/main-override.cpp b/third-party/mimalloc/test/main-override.cpp index c1607b66..582f24ee 100644 --- a/third-party/mimalloc/test/main-override.cpp +++ b/third-party/mimalloc/test/main-override.cpp @@ -108,7 +108,7 @@ static void various_tests() { auto tbuf = new unsigned char[sizeof(Test)]; t = new (tbuf) Test(42); t->~Test(); - delete tbuf; + delete[] tbuf; } class Static { @@ -379,7 +379,7 @@ static void bench_alloc_large(void) { static constexpr size_t kMaxBufferSize = 25 * 1024 * 1024; std::unique_ptr buffers[kNumBuffers]; - std::random_device rd; + std::random_device rd; (void)rd; std::mt19937 gen(42); //rd()); std::uniform_int_distribution<> size_distribution(kMinBufferSize, kMaxBufferSize); std::uniform_int_distribution<> buf_number_distribution(0, kNumBuffers - 1); diff --git a/third-party/mimalloc/test/test-api.c b/third-party/mimalloc/test/test-api.c index 8dd24e1b..76101980 100644 --- a/third-party/mimalloc/test/test-api.c +++ b/third-party/mimalloc/test/test-api.c @@ -34,7 +34,7 @@ we therefore test the API over various inputs. 
Please add more tests :-) #include "mimalloc.h" // #include "mimalloc/internal.h" -#include "mimalloc/types.h" // for MI_DEBUG and MI_ALIGNMENT_MAX +#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX #include "testhelper.h" @@ -46,6 +46,11 @@ bool test_heap2(void); bool test_stl_allocator1(void); bool test_stl_allocator2(void); +bool test_stl_heap_allocator1(void); +bool test_stl_heap_allocator2(void); +bool test_stl_heap_allocator3(void); +bool test_stl_heap_allocator4(void); + bool mem_is_zero(uint8_t* p, size_t size) { if (p==NULL) return false; for (size_t i = 0; i < size; ++i) { @@ -59,7 +64,7 @@ bool mem_is_zero(uint8_t* p, size_t size) { // --------------------------------------------------------------------------- int main(void) { mi_option_disable(mi_option_verbose); - + // --------------------------------------------------- // Malloc // --------------------------------------------------- @@ -154,7 +159,7 @@ int main(void) { }; CHECK_BODY("malloc-aligned6") { bool ok = true; - for (size_t align = 1; align <= MI_ALIGNMENT_MAX && ok; align *= 2) { + for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) { void* ps[8]; for (int i = 0; i < 8 && ok; i++) { ps[i] = mi_malloc_aligned(align*13 // size @@ -170,16 +175,16 @@ int main(void) { result = ok; }; CHECK_BODY("malloc-aligned7") { - void* p = mi_malloc_aligned(1024,MI_ALIGNMENT_MAX); + void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX); mi_free(p); - result = ((uintptr_t)p % MI_ALIGNMENT_MAX) == 0; + result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0; }; CHECK_BODY("malloc-aligned8") { bool ok = true; for (int i = 0; i < 5 && ok; i++) { int n = (1 << i); - void* p = mi_malloc_aligned(1024, n * MI_ALIGNMENT_MAX); - ok = ((uintptr_t)p % (n*MI_ALIGNMENT_MAX)) == 0; + void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX); + ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0; mi_free(p); } result = ok; @@ -187,7 +192,7 @@ int main(void) { 
CHECK_BODY("malloc-aligned9") { bool ok = true; void* p[8]; - size_t sizes[8] = { 8, 512, 1024 * 1024, MI_ALIGNMENT_MAX, MI_ALIGNMENT_MAX + 1, 2 * MI_ALIGNMENT_MAX, 8 * MI_ALIGNMENT_MAX, 0 }; + size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; for (int i = 0; i < 28 && ok; i++) { int align = (1 << i); for (int j = 0; j < 8 && ok; j++) { @@ -225,6 +230,28 @@ int main(void) { result = (((uintptr_t)p % 0x100) == 0); // #602 mi_free(p); } + CHECK_BODY("mimalloc-aligned13") { + bool ok = true; + for( size_t size = 1; size <= (MI_SMALL_SIZE_MAX * 2) && ok; size++ ) { + for(size_t align = 1; align <= size && ok; align *= 2 ) { + void* p[10]; + for(int i = 0; i < 10 && ok; i++) { + p[i] = mi_malloc_aligned(size,align);; + ok = (p[i] != NULL && ((uintptr_t)(p[i]) % align) == 0); + } + for(int i = 0; i < 10 && ok; i++) { + mi_free(p[i]); + } + /* + if (ok && align <= size && ((size + MI_PADDING_SIZE) & (align-1)) == 0) { + size_t bsize = mi_good_size(size); + ok = (align <= bsize && (bsize & (align-1)) == 0); + } + */ + } + } + result = ok; + } CHECK_BODY("malloc-aligned-at1") { void* p = mi_malloc_aligned_at(48,32,0); result = (p != NULL && ((uintptr_t)(p) + 0) % 32 == 0); mi_free(p); }; @@ -295,15 +322,22 @@ int main(void) { // --------------------------------------------------- // various // --------------------------------------------------- + #if !defined(MI_TRACK_ASAN) // realpath may leak with ASAN enabled (as the ASAN allocator intercepts it) CHECK_BODY("realpath") { char* s = mi_realpath( ".", NULL ); // printf("realpath: %s\n",s); mi_free(s); }; + #endif CHECK("stl_allocator1", test_stl_allocator1()); CHECK("stl_allocator2", test_stl_allocator2()); + CHECK("stl_heap_allocator1", test_stl_heap_allocator1()); + CHECK("stl_heap_allocator2", test_stl_heap_allocator2()); + CHECK("stl_heap_allocator3", test_stl_heap_allocator3()); + CHECK("stl_heap_allocator4", 
test_stl_heap_allocator4()); + // --------------------------------------------------- // Done // ---------------------------------------------------[] @@ -357,3 +391,61 @@ bool test_stl_allocator2(void) { return true; #endif } + +bool test_stl_heap_allocator1(void) { +#ifdef __cplusplus + std::vector > vec; + vec.push_back(some_struct()); + vec.pop_back(); + return vec.size() == 0; +#else + return true; +#endif +} + +bool test_stl_heap_allocator2(void) { +#ifdef __cplusplus + std::vector > vec; + vec.push_back(some_struct()); + vec.pop_back(); + return vec.size() == 0; +#else + return true; +#endif +} + +bool test_stl_heap_allocator3(void) { +#ifdef __cplusplus + mi_heap_t* heap = mi_heap_new(); + bool good = false; + { + mi_heap_stl_allocator myAlloc(heap); + std::vector > vec(myAlloc); + vec.push_back(some_struct()); + vec.pop_back(); + good = vec.size() == 0; + } + mi_heap_delete(heap); + return good; +#else + return true; +#endif +} + +bool test_stl_heap_allocator4(void) { +#ifdef __cplusplus + mi_heap_t* heap = mi_heap_new(); + bool good = false; + { + mi_heap_destroy_stl_allocator myAlloc(heap); + std::vector > vec(myAlloc); + vec.push_back(some_struct()); + vec.pop_back(); + good = vec.size() == 0; + } + mi_heap_destroy(heap); + return good; +#else + return true; +#endif +} diff --git a/third-party/mimalloc/test/test-stress.c b/third-party/mimalloc/test/test-stress.c index 7b74b465..15d0e25b 100644 --- a/third-party/mimalloc/test/test-stress.c +++ b/third-party/mimalloc/test/test-stress.c @@ -37,11 +37,12 @@ static int ITER = 50; // N full iterations destructing and re-creating a // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int SCALE = 100; // scaling factor -#define STRESS // undefine for leak test +#define STRESS // undefine for leak test static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
+static bool main_participates = false; // main thread participates as a worker too // #define USE_STD_MALLOC #ifdef USE_STD_MALLOC @@ -276,8 +277,8 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_collect(true); - //mi_debug_show_arenas(); + // mi_collect(true); + mi_debug_show_arenas(true,true,true); #endif mi_stats_print(NULL); #endif @@ -301,13 +302,15 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD)); HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE)); - for (uintptr_t i = 0; i < nthreads; i++) { + const size_t start = (main_participates ? 1 : 0); + for (size_t i = start; i < nthreads; i++) { thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]); } - for (size_t i = 0; i < nthreads; i++) { + if (main_participates) fun(0); // run the main thread as well + for (size_t i = start; i < nthreads; i++) { WaitForSingleObject(thandles[i], INFINITE); } - for (size_t i = 0; i < nthreads; i++) { + for (size_t i = start; i < nthreads; i++) { CloseHandle(thandles[i]); } custom_free(tids); @@ -334,11 +337,13 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; pthread_t* threads = (pthread_t*)custom_calloc(nthreads,sizeof(pthread_t)); memset(threads, 0, sizeof(pthread_t) * nthreads); + const size_t start = (main_participates ? 
1 : 0); //pthread_setconcurrency(nthreads); - for (size_t i = 0; i < nthreads; i++) { + for (size_t i = start; i < nthreads; i++) { pthread_create(&threads[i], NULL, &thread_entry, (void*)i); } - for (size_t i = 0; i < nthreads; i++) { + if (main_participates) fun(0); // run the main thread as well + for (size_t i = start; i < nthreads; i++) { pthread_join(threads[i], NULL); } custom_free(threads); diff --git a/third-party/tbb/.bazelversion b/third-party/tbb/.bazelversion index 09b254e9..21c8c7b4 100644 --- a/third-party/tbb/.bazelversion +++ b/third-party/tbb/.bazelversion @@ -1 +1 @@ -6.0.0 +7.1.1 diff --git a/third-party/tbb/.github/CODEOWNERS b/third-party/tbb/.github/CODEOWNERS new file mode 100644 index 00000000..31805797 --- /dev/null +++ b/third-party/tbb/.github/CODEOWNERS @@ -0,0 +1,7 @@ +# Lines starting with '#' are comments. +# Each line is a file pattern followed by one or more owners. + +# More details are here: https://help.github.com/articles/about-codeowners/ + +src/tbbmalloc @ldorau @lplewa @kfilipek +src/tbbmalloc_proxy @ldorau @lplewa @kfilipek diff --git a/third-party/tbb/.github/issue_labeler.yml b/third-party/tbb/.github/issue_labeler.yml new file mode 100644 index 00000000..cf956edb --- /dev/null +++ b/third-party/tbb/.github/issue_labeler.yml @@ -0,0 +1,27 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# PR template regexp's for issue labeler +bug fix: + - '\[(x|X)\]\sbug\sfix' +enhancement: + - '\[(x|X)\]\snew\sfeature' +tests: + - '\[(x|X)\]\stests' +infrastructure: + - '\[(x|X)\]\sinfrastructure' +documentation: + - '\[(x|X)\]\sdocumentation' +allocator: + - '\[(x|X)\]\sallocator' \ No newline at end of file diff --git a/third-party/tbb/.github/workflows/ci.yml b/third-party/tbb/.github/workflows/ci.yml index da95c94a..a65de622 100644 --- a/third-party/tbb/.github/workflows/ci.yml +++ b/third-party/tbb/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023 Intel Corporation +# Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ on: - synchronize - reopened +permissions: read-all + env: BUILD_CONCURRENCY: 2 MACOS_BUILD_CONCURRENCY: 3 @@ -57,7 +59,7 @@ jobs: needs: [codespell] env: BUILD_TYPE: oss - runs-on: [ubuntu-20.04] + runs-on: [ubuntu-22.04] timeout-minutes: 10 steps: - uses: actions/checkout@v2 @@ -80,6 +82,10 @@ jobs: pages: if: ${{ github.ref == 'refs/heads/master' }} + permissions: + contents: write + pages: write + id-token: write runs-on: ubuntu-latest needs: [documentation] steps: @@ -140,7 +146,7 @@ jobs: ctest -R python_test --output-on-failure --timeout ${TEST_TIMEOUT} linux-testing: - name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }} + name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}${{ matrix.cmake_static }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 45 strategy: @@ -165,6 +171,13 @@ jobs: std: 20 build_type: debug preview: 'ON' + - os: ubuntu-22.04 + c_compiler: gcc-11 + cxx_compiler: g++-11 + std: 20 + build_type: release + preview: 'ON' + cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - uses: actions/checkout@v2 - name: Run testing 
@@ -172,25 +185,32 @@ jobs: run: | set -x mkdir build && cd build - cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.cmake_static }} \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} .. make VERBOSE=1 -j${BUILD_CONCURRENCY} ctest --timeout ${TEST_TIMEOUT} --output-on-failure macos-testing: - name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }} + name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}${{ matrix.cmake_static }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 45 strategy: fail-fast: false matrix: include: - - os: macos-10.15 + - os: macos-12 c_compiler: clang cxx_compiler: clang++ std: 14 build_type: relwithdebinfo preview: 'ON' + - os: macos-13 + c_compiler: clang + cxx_compiler: clang++ + std: 20 + build_type: release + preview: 'ON' + cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - uses: actions/checkout@v2 - name: Run testing @@ -198,7 +218,7 @@ jobs: run: | set -x mkdir build && cd build - cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.cmake_static }} \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} .. 
make VERBOSE=1 -j${MACOS_BUILD_CONCURRENCY} ctest --timeout ${TEST_TIMEOUT} --output-on-failure @@ -219,6 +239,15 @@ jobs: build_type: relwithdebinfo preview: 'ON' job_name: windows_cl2019_cxx14_relwithdebinfo_preview=ON + - os: windows-2019 + generator: Visual Studio 16 2019 + c_compiler: cl + cxx_compiler: cl + std: 20 + build_type: release + preview: 'ON' + job_name: windows_cl2019_cxx20_release_preview=ON-DBUILD_SHARED_LIBS=OFF + cmake_static: -DBUILD_SHARED_LIBS=OFF - os: windows-2022 generator: Visual Studio 17 2022 c_compiler: cl @@ -233,7 +262,7 @@ jobs: run: | mkdir build cd build - cmake -G "${{ matrix.generator }}" -A x64 -DCMAKE_CXX_STANDARD=${{ matrix.std }} ` + cmake -G "${{ matrix.generator }}" -A x64 -DCMAKE_CXX_STANDARD=${{ matrix.std }} ${{ matrix.cmake_static }} ` -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} ` -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} .. cmake --build . --config ${{ matrix.build_type }} -j -v @@ -285,7 +314,7 @@ jobs: fail-fast: false matrix: include: - - os: macos-10.15 + - os: macos-12 c_compiler: clang cxx_compiler: clang++ std: 14 diff --git a/third-party/tbb/.github/workflows/issue_labeler.yml b/third-party/tbb/.github/workflows/issue_labeler.yml new file mode 100644 index 00000000..80591aa9 --- /dev/null +++ b/third-party/tbb/.github/workflows/issue_labeler.yml @@ -0,0 +1,37 @@ +# Copyright (c) 2023-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: "Issue Labeler" +on: + issues: + types: [opened, edited] + pull_request: + types: [opened, edited] + +permissions: read-all + +jobs: + triage: + runs-on: ubuntu-latest + permissions: + pull-requests: write + issues: write + contents: read + steps: + - uses: github/issue-labeler@v3.2 #May not be the latest version + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + configuration-path: .github/issue_labeler.yml + enable-versioned-regex: 0 + sync-labels: 1 diff --git a/third-party/tbb/.github/workflows/labeler.yml b/third-party/tbb/.github/workflows/labeler.yml index 8dbb0962..36812ebd 100644 --- a/third-party/tbb/.github/workflows/labeler.yml +++ b/third-party/tbb/.github/workflows/labeler.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2023-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ name: "Pull Request Labeler" on: - pull_request_target +permissions: read-all + jobs: triage: permissions: diff --git a/third-party/tbb/BUILD.bazel b/third-party/tbb/BUILD.bazel index 3881d684..34f98eba 100644 --- a/third-party/tbb/BUILD.bazel +++ b/third-party/tbb/BUILD.bazel @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022 Intel Corporation +# Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -116,3 +116,16 @@ cc_library( ":tbbmalloc", ], ) + +cc_test( + name = "test_task", + srcs = [ + "test/tbb/test_task.cpp", + ] + glob([ + "test/common/*.h", + ]), + includes = ["test"], + deps = [ + ":tbb", + ], +) diff --git a/third-party/tbb/CMakeLists.txt b/third-party/tbb/CMakeLists.txt index 47872941..19232a99 100644 --- a/third-party/tbb/CMakeLists.txt +++ b/third-party/tbb/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) # Enable CMake policies +if (POLICY CMP0068) + # RPATH settings do not affect install_name on macOS since CMake 3.9 + cmake_policy(SET CMP0068 NEW) +endif() + if (POLICY CMP0091) # The NEW behavior for this policy is to not place MSVC runtime library flags in the default # CMAKE__FLAGS_ cache entries and use CMAKE_MSVC_RUNTIME_LIBRARY abstraction instead. @@ -38,12 +43,6 @@ if (APPLE) endif() endif() -# Until CMake 3.4.0 FindThreads.cmake requires C language enabled. -# Enable C language before CXX to avoid possible override of CMAKE_SIZEOF_VOID_P. 
-if (CMAKE_VERSION VERSION_LESS 3.4) - enable_language(C) -endif() - file(READ include/oneapi/tbb/version.h _tbb_version_info) string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1" _tbb_ver_major "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" _tbb_ver_minor "${_tbb_version_info}") @@ -104,8 +103,13 @@ option(TBBMALLOC_BUILD "Enable tbbmalloc build" ON) cmake_dependent_option(TBBMALLOC_PROXY_BUILD "Enable tbbmalloc_proxy build" ON "TBBMALLOC_BUILD" OFF) option(TBB_CPF "Enable preview features of the library" OFF) option(TBB_FIND_PACKAGE "Enable search for external oneTBB using find_package instead of build from sources" OFF) -option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg-config tool" OFF) +option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg-config tool" ${CMAKE_CROSSCOMPILING}) option(TBB_ENABLE_IPO "Enable Interprocedural Optimization (IPO) during the compilation" ON) +option(TBB_FUZZ_TESTING "Enable fuzz testing" OFF) +option(TBB_INSTALL "Enable installation" ON) +if(APPLE) +option(TBB_BUILD_APPLE_FRAMEWORKS "Build as Apple Frameworks" OFF) +endif() if (NOT DEFINED BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS ON) @@ -118,11 +122,6 @@ if (NOT BUILD_SHARED_LIBS) message(WARNING "You are building oneTBB as a static library. This is highly discouraged and such configuration is not supported. Consider building a dynamic library to avoid unforeseen issues.") endif() -# Prevent searching HWLOC by pkg-config on macOS -if (APPLE) - set(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH ON) -endif() - if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "Build type" FORCE) message(STATUS "CMAKE_BUILD_TYPE is not specified. 
Using default: ${CMAKE_BUILD_TYPE}") @@ -197,6 +196,11 @@ endif() # ------------------------------------------------------------------- # Common dependencies +#force -pthread during compilation for Emscripten +if (EMSCRIPTEN AND NOT EMSCRIPTEN_WITHOUT_PTHREAD) + set(THREADS_HAVE_PTHREAD_ARG TRUE) +endif() + set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) # ------------------------------------------------------------------- @@ -229,7 +233,7 @@ else() message(WARNING "TBB compiler settings not found ${TBB_COMPILER_SETTINGS_FILE}") endif() -if (TBB_FIND_PACKAGE OR TBB_DIR) +if (TBB_FIND_PACKAGE AND TBB_DIR) # Allow specifying external TBB to test with. # Do not add main targets and installation instructions in that case. message(STATUS "Using external TBB for testing") @@ -244,39 +248,44 @@ else() add_subdirectory(src/tbbmalloc_proxy) endif() endif() - if (APPLE OR NOT BUILD_SHARED_LIBS) + if (NOT BUILD_SHARED_LIBS) message(STATUS "TBBBind build targets are disabled due to unsupported environment") else() add_subdirectory(src/tbbbind) endif() + if (TBB_INSTALL) + # ------------------------------------------------------------------- + # Installation instructions + include(CMakePackageConfigHelpers) + + install(DIRECTORY include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + COMPONENT devel) + + install(EXPORT ${PROJECT_NAME}Targets + NAMESPACE TBB:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} + COMPONENT devel) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake + "include(\${CMAKE_CURRENT_LIST_DIR}/${PROJECT_NAME}Targets.cmake)\n") + if (NOT BUILD_SHARED_LIBS) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake + "include(CMakeFindDependencyMacro)\nfind_dependency(Threads)\n") + endif() - # ------------------------------------------------------------------- - # Installation instructions - include(CMakePackageConfigHelpers) - - install(DIRECTORY include/ - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} - 
COMPONENT devel) - - install(EXPORT ${PROJECT_NAME}Targets - NAMESPACE TBB:: - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} - COMPONENT devel) - file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake - "include(\${CMAKE_CURRENT_LIST_DIR}/${PROJECT_NAME}Targets.cmake)\n") - - write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" - COMPATIBILITY AnyNewerVersion) - - install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} - COMPONENT devel) - - install(FILES "README.md" - DESTINATION ${CMAKE_INSTALL_DOCDIR} - COMPONENT devel) - # ------------------------------------------------------------------- + write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + COMPATIBILITY AnyNewerVersion) + + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} + COMPONENT devel) + + install(FILES "README.md" + DESTINATION ${CMAKE_INSTALL_DOCDIR} + COMPONENT devel) + # ------------------------------------------------------------------- + endif() endif() if (TBB_TEST) diff --git a/third-party/tbb/CONTRIBUTING.md b/third-party/tbb/CONTRIBUTING.md index c8b43708..3048b211 100644 --- a/third-party/tbb/CONTRIBUTING.md +++ b/third-party/tbb/CONTRIBUTING.md @@ -29,11 +29,6 @@ The DCO is an attestation attached to every contribution made by every developer As a contributor, you’ll want to be familiar with the oneTBB project and the repository layout. 
You should also know how to use it as explained in the [oneTBB documentation](https://oneapi-src.github.io/oneTBB/) and how to set up your build development environment to configure, build, and test oneTBB as explained in the [oneTBB Build System Description](cmake/README.md). -## Issues -If you face a problem, first check out open [oneTBB GitHub issues](https://github.com/oneapi-src/oneTBB/issues) to see if the issue you’d like to address is already reported. You may find users that have encountered the bug you’re finding or have similar ideas for changes or additions. - -You can use issues to report a problem, make a feature request, or add comments on an existing issue. - ## Pull Requests You can find all [open oneTBB pull requests](https://github.com/oneapi-src/oneTBB/pulls) on GitHub. diff --git a/third-party/tbb/INSTALL.md b/third-party/tbb/INSTALL.md index 3c63c9fd..0ac95f87 100644 --- a/third-party/tbb/INSTALL.md +++ b/third-party/tbb/INSTALL.md @@ -61,7 +61,7 @@ You can use the ``install`` components for partial installation. The following install components are supported: - `runtime` - oneTBB runtime package (core shared libraries and `.dll` files on Windows* OS). - `devel` - oneTBB development package (header files, CMake integration files, library symbolic links, and `.lib` files on Windows* OS). -- `tbb4py` - [oneTBB Module for Python](#onetbb-python-module-support). +- `tbb4py` - [oneTBB Module for Python](https://github.com/oneapi-src/oneTBB/blob/master/python/README.md). If you want to install specific components after configuration and build, run: diff --git a/third-party/tbb/MODULE.bazel b/third-party/tbb/MODULE.bazel new file mode 100644 index 00000000..cc6698f0 --- /dev/null +++ b/third-party/tbb/MODULE.bazel @@ -0,0 +1,24 @@ +# Copyright (c) 2021-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: Bazel support is community-based. The maintainers do not +# use Bazel internally. The Bazel build can have security risks or +# optimization gaps. + +module( + name = "onetbb", + compatibility_level = 1, +) + +bazel_dep(name = "platforms", version = "0.0.9") diff --git a/third-party/tbb/README.md b/third-party/tbb/README.md index b96e1fb0..f2bc0a0a 100644 --- a/third-party/tbb/README.md +++ b/third-party/tbb/README.md @@ -23,7 +23,8 @@ oneTBB is a part of [oneAPI](https://oneapi.io). The current branch implements v > **_NOTE:_** Threading Building Blocks (TBB) is now called oneAPI Threading Building Blocks (oneTBB) to highlight that the tool is a part of the oneAPI ecosystem. ## Release Information -Here are [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREMENTS.md). + +See [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREMENTS.md). ## Documentation * [oneTBB Specification](https://spec.oneapi.com/versions/latest/elements/oneTBB/source/nested-index.html) @@ -39,7 +40,7 @@ Here are [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQU See [Installation from Sources](INSTALL.md) to learn how to install oneTBB. ## Support -Please report issues and suggestions via [GitHub issues](https://github.com/oneapi-src/oneTBB/issues). See our [documentation](./CONTRIBUTING.md##Issues) to learn how to work with them. +See our [documentation](./SUPPORT.md) to learn how to request help. 
## How to Contribute We welcome community contributions, so check our [Contributing Guidelines](CONTRIBUTING.md) @@ -49,7 +50,6 @@ to learn more. oneAPI Threading Building Blocks is licensed under [Apache License, Version 2.0](LICENSE.txt). By its terms, contributions submitted to the project are also done under that license. - ## Engineering team contacts * [Email us.](mailto:inteltbbdevelopers@intel.com) diff --git a/third-party/tbb/RELEASE_NOTES.md b/third-party/tbb/RELEASE_NOTES.md index 57258416..c9b8e971 100644 --- a/third-party/tbb/RELEASE_NOTES.md +++ b/third-party/tbb/RELEASE_NOTES.md @@ -18,26 +18,25 @@ This document contains changes of oneTBB compared to the last release. ## Table of Contents -- [New Features](#new-features) - [Known Limitations](#known-limitations) - [Fixed Issues](#fixed-issues) -## :tada: New Features -- Since C++17, parallel algorithms and Flow Graph nodes are allowed to accept pointers to the member functions and member objects as the user-provided callables. -- Added missed member functions, such as assignment operators and swap function, to the ``concurrent_queue`` and ``concurrent_bounded_queue`` containers. - ## :rotating_light: Known Limitations -- A static assert will cause compilation failures in oneTBB headers when compiling with clang 12.0.0 or newer if using the LLVM standard library with ``-ffreestanding`` and C++11/14 compiler options. -- An application using Parallel STL algorithms in libstdc++ versions 9 and 10 may fail to compile due to incompatible interface changes between earlier versions of Threading Building Blocks (TBB) and oneAPI Threading Building Blocks (oneTBB). Disable support for Parallel STL algorithms by defining ``PSTL_USE_PARALLEL_POLICIES`` (in libstdc++ 9) or ``_GLIBCXX_USE_TBB_PAR_BACKEND`` (in libstdc++ 10) macro to zero before inclusion of the first standard header file in each translation unit. 
-- On Linux* OS, if oneAPI Threading Building Blocks (oneTBB) or Threading Building Blocks (TBB) are installed in a system folder like ``/usr/lib64``, the application may fail to link due to the order in which the linker searches for libraries. Use the ``-L`` linker option to specify the correct location of oneTBB library. This issue does not affect the program execution. -- The ``oneapi::tbb::info`` namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using hwloc* version lower than 2.5. -- Using a hwloc* version other than 1.11, 2.0, or 2.5 may cause an undefined behavior on Windows* OS. See https://github.com/open-mpi/hwloc/issues/477 for details. -- The NUMA* topology may be detected incorrectly on Windows* OS machines where the number of NUMA* node threads exceeds the size of 1 processor group. -- On Windows* OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by specifying ``/wd4324`` to the compiler command line. -- oneTBB does not support ``fork()``, to work-around the issue, consider using task_scheduler_handle to join oneTBB worker threads before using fork(). -- C++ exception handling mechanism on Windows* OS on ARM64* might corrupt memory if an exception is thrown from any oneTBB parallel algorithm (see Windows* OS on ARM64* compiler issue: https://developercommunity.visualstudio.com/t/ARM64-incorrect-stack-unwinding-for-alig/1544293). +- The ``oneapi::tbb::info`` namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using hwloc version lower than 2.5. +- Using a hwloc version other than 1.11, 2.0, or 2.5 may cause an undefined behavior on Windows OS. 
See https://github.com/open-mpi/hwloc/issues/477 for details. +- The NUMA topology may be detected incorrectly on Windows* OS machines where the number of NUMA node threads exceeds the size of 1 processor group. +- On Windows OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by specifying /wd4324 to the compiler command line. +- C++ exception handling mechanism on Windows* OS on ARM64* might corrupt memory if an exception is thrown from any oneTBB parallel algorithm (see Windows* OS on ARM64* compiler issue: https://developercommunity.visualstudio.com/t/ARM64-incorrect-stack-unwinding-for-alig/1544293. +- When CPU resource coordination is enabled, tasks from a lower-priority ``task_arena`` might be executed before tasks from a higher-priority ``task_arena``. + +> **_NOTE:_** To see known limitations that impact all versions of oneTBB, refer to [oneTBB Documentation](https://oneapi-src.github.io/oneTBB/main/intro/limitations.html). + ## :hammer: Fixed Issues -- Fixed the hang in the reserve method of concurrent unordered containers ([GitHub* #1056](http://github.com/oneapi-src/oneTBB/issues/1056)). -- Fixed the C++20 three-way comparison feature detection ([GitHub* #1093](http://github.com/oneapi-src/oneTBB/issues/1093)). -- Fixed oneTBB integration with CMake* in the Conda* environment. +- Fixed ``parallel_for_each`` algorithm behavior for iterators defining ``iterator_concept`` trait instead of ``iterator_category``. +- Fixed the redefinition issue for ``std::min`` and ``std::max`` on Windows* OS ([GitHub* #832](https://github.com/oneapi-src/oneTBB/issues/832)). +- Fixed the incorrect binary search order in ``TBBConfig.cmake``. +- Enabled the oneTBB library search using the pkg-config tool in Conda packages. 
+ +## :octocat: Open-source Contributions Integrated +- Fixed the compiler warning for missing virtual destructor. Contributed by Elias Engelbert Plank (https://github.com/oneapi-src/oneTBB/pull/1215). diff --git a/third-party/tbb/SECURITY.md b/third-party/tbb/SECURITY.md index c4a49dd5..4926041f 100644 --- a/third-party/tbb/SECURITY.md +++ b/third-party/tbb/SECURITY.md @@ -1,7 +1,66 @@ # Security Policy -Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, -impact, severity and mitigation. +As an open-source project, we understand the importance of and responsibility +for security. This Security Policy outlines our guidelines and procedures to +ensure the highest level of security and trust for oneTBB users. -## Reporting a Vulnerability -Please report any security vulnerabilities in this project -[utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). +## Supported Versions +Security vulnerabilities are fixed in the [latest version][1] +and delivered as a patch release. We don't guarantee security fixes to be +back-ported to older oneTBB versions. + +## Report a Vulnerability +We are very grateful to the security researchers and users that report back +security vulnerabilities. We investigate every report thoroughly. +We strongly encourage you to report security vulnerabilities to us privately, +before disclosing them on public forums or opening a public GitHub* issue. + +Report a vulnerability to us in one of two ways: +* Open a draft **[GitHub* Security Advisory][2]** +* Send an e-mail to: **security@uxlfoundation.org**. +Along with the report, provide the following info: + * A descriptive title. + * Your name and affiliation (if any). + * A description of the technical details of the vulnerabilities. + * A minimal example of the vulnerability so we can reproduce your findings. 
+ * An explanation of who can exploit this vulnerability, and what they gain + doing so. + * Whether this vulnerability is public or known to third parties. If it is, + provide details. + +### When Should I Report a Vulnerability? +* You think you discovered a potential security vulnerability in oneTBB. +* You are unsure how the potential vulnerability affects oneTBB. +* You think you discovered a vulnerability in another project or 3rd party +component on which oneTBB depends. If the issue is not fixed in the 3rd party +component, try to report directly there first. + +### When Should I NOT Report a Vulnerability? +* You got an automated scan hit and are unable to provide details. +* You need help using oneTBB for security. +* You need help applying security-related updates. +* Your issue is not security-related. + +## Security Reports Review Process +We aim to respond quickly to your inquiry and coordinate a fix and +disclosure with you. All confirmed security vulnerabilities will be addressed +according to severity level and impact on oneTBB. Normally, security issues +are fixed in the next planned release. + +## Disclosure Policy +We will publish security advisories using the +[**GitHub Security Advisories feature**][3] +to keep our community well-informed, and will credit you for your findings +unless you prefer to stay anonymous. We request that you refrain from +exploiting the vulnerability or making it public before the official disclosure. + +We will disclose the vulnerabilities and bugs as soon as possible once +mitigation is implemented and available. + +## Feedback on This Policy +If you have any suggestions on how this Policy could be improved, submit +an issue or a pull request to this repository. **Do not** report +potential vulnerabilities or security flaws via a pull request. 
+ +[1]: https://github.com/oneapi-src/oneTBB/releases/latest +[2]: https://github.com/oneapi-src/oneTBB/security/advisories/new +[3]: https://github.com/oneapi-src/oneTBB/security/advisories diff --git a/third-party/tbb/SUPPORT.md b/third-party/tbb/SUPPORT.md new file mode 100644 index 00000000..47bb60a5 --- /dev/null +++ b/third-party/tbb/SUPPORT.md @@ -0,0 +1,35 @@ + + +# oneTBB Support + +We are committed to providing support and assistance to help you make the most out of oneTBB. +Use the following methods if you face any challenges. + +## Issues + +If you have a problem, check out the [GitHub Issues](https://github.com/oneapi-src/oneTBB/issues) to see if the issue you want to address is already reported. +You may find users that have encountered the same bug or have similar ideas for changes or updates. + +You can use issues to report a problem, make a feature request, or add comments on an existing issue. + +## Discussions + +Visit the [GitHub Discussions](https://github.com/oneapi-src/oneTBB/discussions) to engage with the community, ask questions, or help others. + +## Email + +Reach out to us privately via [email](mailto:inteltbbdevelopers@intel.com). 
\ No newline at end of file diff --git a/third-party/tbb/SYSTEM_REQUIREMENTS.md b/third-party/tbb/SYSTEM_REQUIREMENTS.md index 803041c6..7f9d8161 100644 --- a/third-party/tbb/SYSTEM_REQUIREMENTS.md +++ b/third-party/tbb/SYSTEM_REQUIREMENTS.md @@ -44,10 +44,10 @@ This document provides details about hardware, operating system, and software pr - Microsoft* Windows* Server 2022 - Systems with Linux* operating systems: - Oracle Linux* 8 - - Amazon* Linux* 2 + - Amazon* Linux 2, 2022 - Debian* 9, 10, 11 - - Fedora* 36, 37 - - Rocky* Linux* 9 + - Fedora* 36, 37, 38 + - Rocky* Linux* 8, 9 - Red Hat* Enterprise Linux* 8, 9 - SuSE* Linux* Enterprise Server 15 - Ubuntu* 20.04, 22.04 @@ -64,12 +64,12 @@ This document provides details about hardware, operating system, and software pr ### Supported Compilers - Intel* oneAPI DPC++/C++ Compiler -- Intel* C++ Compiler 19.0 and 19.1 version +- Intel® C++ Compiler Classic 2021.1 - 2021.9 - Microsoft* Visual C++ 14.2 (Microsoft* Visual Studio* 2019, Windows* OS only) - Microsoft* Visual C++ 14.3 (Microsoft* Visual Studio* 2022, Windows* OS only) - For each supported Linux* operating system, the standard gcc version provided with that operating system is supported: - - GNU Compilers (gcc) 4.8.5 - 11.2.1 - - GNU C Library (glibc) version 2.17 - 2.34 + - GNU Compilers (gcc) 8.x – 12.x + - GNU C Library (glibc) version 2.28 – 2.36 - Clang* 6.0.0 - 13.0.0 ## Limitations diff --git a/third-party/tbb/WASM_Support.md b/third-party/tbb/WASM_Support.md index 67925ee4..8c2f6c1a 100644 --- a/third-party/tbb/WASM_Support.md +++ b/third-party/tbb/WASM_Support.md @@ -16,16 +16,45 @@ # WASM Support +oneTBB extends its capabilities by offering robust support for ``WASM``. + ``WASM`` stands for WebAssembly, a low-level binary format for executing code in web browsers. -It is designed to be a portable target for compilers and to be efficient to parse and execute. +It is designed to be a portable target for compilers and efficient to parse and execute. 
+ +Using oneTBB with WASM, you can take full advantage of parallelism and concurrency while working on web-based applications, interactive websites, and a variety of other WASM-compatible platforms. + +oneTBB offers WASM support through the integration with [Emscripten*](https://emscripten.org/docs/introducing_emscripten/index.html), a powerful toolchain for compiling C and C++ code into WASM-compatible runtimes. + +## Build + +**Prerequisites:** Download and install Emscripten*. See the [instructions](https://emscripten.org/docs/getting_started/downloads.html). + +To build the system, run: + +``` +mkdir build && cd build +emcmake cmake .. -DCMAKE_CXX_COMPILER=em++ -DCMAKE_C_COMPILER=emcc -DTBB_STRICT=OFF -DCMAKE_CXX_FLAGS=-Wno-unused-command-line-argument -DTBB_DISABLE_HWLOC_AUTOMATIC_SEARCH=ON -DBUILD_SHARED_LIBS=ON -DTBB_EXAMPLES=ON -DTBB_TEST=ON +``` +To compile oneTBB without ``pthreads``, set the flag ``-DEMSCRIPTEN_WITHOUT_PTHREAD=true`` in the command above. By default, oneTBB uses the ``pthreads``. +``` +cmake --build . +cmake --install . +``` +Where: + +* ``emcmake`` - a tool that sets up the environment for Emscripten*. +* ``-DCMAKE_CXX_COMPILER=em++`` - specifies the C++ compiler as Emscripten* C++ compiler. +* ``-DCMAKE_C_COMPILER=emcc`` - specifies the C compiler as Emscripten* C compiler. + -WebAssembly aims to provide a fast, efficient, and safe way to run code in web browsers without needing plugins or other software. Code written in a variety of programming languages, including C, C++, Rust and others, can be compiled into WebAssembly format for use in web pages. This allows you to write high-performance applications that run directly in the browser. +> **_NOTE:_** See [CMake documentation](https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md) to learn about other options. -We currently have an [under development branch that provides you with WASM support](https://github.com/oneapi-src/oneTBB/tree/tbb_wasm). 
-By using WASM, you can: -* Create highly performant and scalable applications that can meet the demands of modern web-based systems. -* Take advantage of oneTBB features to optimize the performance of your web-based applications. +## Run Test +To run tests, use: +``` +ctest +``` diff --git a/third-party/tbb/WORKSPACE.bazel b/third-party/tbb/WORKSPACE.bazel index 6431b29b..59ba39f7 100644 --- a/third-party/tbb/WORKSPACE.bazel +++ b/third-party/tbb/WORKSPACE.bazel @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Intel Corporation +# Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,4 +16,4 @@ # use Bazel internally. The Bazel build can have security risks or # optimization gaps. -workspace(name = "oneTBB") +# WORKSPACE marker file needed by Bazel diff --git a/third-party/tbb/cmake/README.md b/third-party/tbb/cmake/README.md index 0734b1f8..aa811b0f 100644 --- a/third-party/tbb/cmake/README.md +++ b/third-party/tbb/cmake/README.md @@ -1,30 +1,32 @@ -# Build system description +# Build System Description -The project uses CMake build configuration. +The project uses CMake* build configuration. The following controls are available during the configure stage: ``` TBB_TEST:BOOL - Enable testing (ON by default) TBB_STRICT:BOOL - Treat compiler warnings as errors (ON by default) TBB_SANITIZE:STRING - Sanitizer parameter, passed to compiler/linker -TBB_SIGNTOOL:FILEPATH - Tool for digital signing, used in post install step for libraries if provided. +TBB_SIGNTOOL:FILEPATH - Tool for digital signing, used in post-install step for libraries if provided. TBB_SIGNTOOL_ARGS:STRING - Additional arguments for TBB_SIGNTOOL, used if TBB_SIGNTOOL is set. 
TBB_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) build (ON by default) TBBMALLOC_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator build (ON by default) TBBMALLOC_PROXY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator proxy build (requires TBBMALLOC_BUILD. ON by default) TBB4PY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) Python module build (OFF by default) TBB_CPF:BOOL - Enable preview features of the library (OFF by default) +TBB_INSTALL:BOOL - Enable installation (ON by default) TBB_INSTALL_VARS:BOOL - Enable auto-generated vars installation(packages generated by `cpack` and `make install` will also include the vars script)(OFF by default) TBB_VALGRIND_MEMCHECK:BOOL - Enable scan for memory leaks using Valgrind (OFF by default) TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH - Disable HWLOC automatic search by pkg-config tool (OFF by default) TBB_ENABLE_IPO - Enable Interprocedural Optimization (IPO) during the compilation (ON by default) +TBB_BUILD_APPLE_FRAMEWORKS - Enable the Apple* frameworks instead of dylibs, only available on the Apple platform. (OFF by default) ``` -## Configure, build, and test +## Configure, Build, and Test ### Preparation -To perform out-of-source build, create a build directory and go there: +To perform an out-of-source build, create a build directory and go there: ```bash mkdir /tmp/my-build @@ -39,14 +41,16 @@ cmake Some useful options: - `-G ` - specify particular project generator. See `cmake --help` for details. -- `-DCMAKE_BUILD_TYPE=Debug` - specify for Debug build. It is not applicable for multiconfig generators, e.g. for Visual Studio* generator. +- `-DCMAKE_BUILD_TYPE=Debug` - specify for Debug build. It is not applicable for multi-config generators, e.g., Microsoft* Visual Studio* generator. 
-#### TBBBind library configuration +#### TBBBind Library Configuration -The TBBbind library has three versions: `tbbbind`, `tbbbind_2_0`, and `tbbbind_2_5`. Each of these versions is linked with corresponding HWLOC library version: -- `tbbbind` links with HWLOC 1.11.x -- `tbbbind_2_0` links with HWLOC 2.1–2.4 -- `tbbbind_2_5` links with HWLOC 2.5 and later +> **_TIP:_** It is recommended to install the HWLOC* library. See [oneTBB documentation](https://oneapi-src.github.io/oneTBB/GSG/next_steps.html#hybrid-cpu-and-numa-support) for details. + +The TBBbind library has three versions: `tbbbind`, `tbbbind_2_0`, and `tbbbind_2_5`. Each of these versions is linked with the corresponding HWLOC* library version: +- `tbbbind` links with `HWLOC 1.11.x` +- `tbbbind_2_0` links with `HWLOC 2.1–2.4` +- `tbbbind_2_5` links with `HWLOC 2.5` and later The search for a suitable version of the HWLOC library is enabled by default. If you want to use a specific version of the library, you can specify the path to it manually using the following CMake variables: @@ -63,17 +67,17 @@ The search for a suitable version of the HWLOC library is enabled by default. If Windows* OS requires an additional variable for correct TBBBind library building: - `CMAKE_HWLOC__DLL_PATH` - path to the corresponding HWLOC version `.dll` file. -`HWLOC_VER` substring used earlier can be replaced with one of the three values: +The `HWLOC_VER` substring used earlier can be replaced with one of the three values: - `1_11` for the `tbbbind` library configuration - `2` for the `tbbbind_2_0` library configuration - `2_5` for the `tbbbind_2_5` library configuration -If you specify variables for several TBBBind versions, the building process for all of these versions is performed during single build session. +If you specify variables for several TBBBind versions, the building process for all of these versions is performed during a single build session. 
--- **TIP** -Specify the `TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH` to disable HWLOC libraries automatic search. +Specify the `TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH` to turn off the HWLOC library's automatic search. --- @@ -85,24 +89,24 @@ cmake --build . ``` Some useful options: -- `--target ` - specific target, "all" is default. -- `--config ` - build configuration, applicable only for multiconfig generators, e.g. Visual Studio* generator. +- `--target ` - specific target, "all" is the default. +- `--config ` - build configuration, applicable only for multi-config generators, e.g., Visual Studio* generator. The binaries are placed to `./__cxx_`. For example, `./gnu_4.8_cxx11_release`. -#### Build for 32-bit +#### Build For 32-bit * **Intel(R) Compiler**. Source Intel(R) C++ Compiler with `ia32` and build as usual. -* **MSVC**. Use switch for [generator](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html) (e.g. `-A Win32` for [VS2019](https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2016%202019.html)) during the configuration stage and then build as usual. +* **MSVC**. Use switch for [generator](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html) (e.g., `-A Win32` for [VS2019](https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2016%202019.html)) during the configuration stage and then build as usual. * **GCC/Clang**. Specify `-m32` during the configuration. It can be `CXXFLAGS=-m32 cmake ..` or `cmake -DCMAKE_CXX_FLAGS=-m32 ..` -* For any other compiler, which builds for 64-bit by default, specify 32-bit compiler key during the configuration as above. +* For any other compiler, which builds for 64-bit by default, specify a 32-bit compiler key during the configuration as above. -#### Windows* OS specific builds +#### Windows* OS-Specific Builds --- **NOTE** -Following builds require CMake version 3.15 or higher. +The following builds require CMake version 3.15 or higher. 
--- @@ -123,12 +127,12 @@ cmake -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded .. ```bash cmake -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDebug -DCMAKE_BUILD_TYPE=Debug .. ``` -* **Windows* OS 10 Universal Windows application build**. Set `CMAKE_SYSTEM_NAME` to `WindowsStore` and `CMAKE_SYSTEM_VERSION` to `10.0`. +* **Windows OS 10 Universal Windows application build**. Set `CMAKE_SYSTEM_NAME` to `WindowsStore` and `CMAKE_SYSTEM_VERSION` to `10.0`. --- **NOTE** -Set `TBB_NO_APPCONTAINER` to `ON` in order to apply `/APPCONTAINER:NO` option during the compilation (used for testing). +Set `TBB_NO_APPCONTAINER` to `ON` to apply the `/APPCONTAINER:NO` option during the compilation (used for testing). --- @@ -136,16 +140,31 @@ Set `TBB_NO_APPCONTAINER` to `ON` in order to apply `/APPCONTAINER:NO` option du cmake -DCMAKE_SYSTEM_NAME:STRING=WindowsStore -DCMAKE_SYSTEM_VERSION:STRING=10.0 .. ``` -* **Universal Windows* OS Driver build**. Set `TBB_WINDOWS_DRIVER` to `ON` and use static linkage with CRT. +* **Universal Windows OS Driver build**. Set `TBB_WINDOWS_DRIVER` to `ON` and use static linkage with CRT. ```bash cmake -DTBB_WINDOWS_DRIVER=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded .. ``` +#### Example + +```bash +cmake -DCMAKE_CXX_COMPILER=icpc -DCMAKE_C_COMPILER=icc -DTBB_TEST=off -DCMAKE_HWLOC_1_11_LIBRARY_PATH=/libhwloc.so.15 +-DCMAKE_HWLOC_1_11_INCLUDE_PATH= -DCMAKE_INSTALL_PREFIX=/oneTBB_install .. +make -j8 && make install +``` + +--- +**NOTE** + +The library path points to a file, while the include path points to a directory and not to ``hwloc.h``. + +--- + ### Test #### Build test -To build a test, use the default target 'all': +To build a test, use the default target ``all``: ``` cmake --build . ``` @@ -155,14 +174,14 @@ Or use a specific test target: cmake --build . --target # e.g. 
test_version ``` -#### Run test +#### Run Test You can run a test by using CTest: ```bash ctest ``` -Or by using 'test' target: +Or by using the ``test`` target: ```bash cmake --build . --target test # currently does not work on Windows* OS ``` @@ -170,7 +189,7 @@ cmake --build . --target test # currently does not work on Windows* OS ## Installation See [Installation from Sources](../INSTALL.md) to learn how to install oneTBB. -## Sanitizers - Ñonfigure, build, and run +## Sanitizers - Configure, Build, and Run ```bash mkdir build @@ -180,7 +199,7 @@ make -j ctest -V ``` -## Valgrind memcheck - configure, build, and run +## Valgrind Memcheck - Configure, Build, and Run ### Prerequisites * Valgrind tool executable @@ -192,9 +211,9 @@ cmake -DTBB_VALGRIND_MEMCHECK=ON .. make -j memcheck- # or memcheck-all to scan all tests ``` -## Test specification +## Test Specification -Use Doxygen to generate oneTBB test specification: +Use Doxygen* to generate oneTBB test specification: ```bash mkdir build @@ -203,12 +222,12 @@ cmake -DTBB_TEST_SPEC=ON .. make test_spec ``` -## TBBConfig - integration of binary packages +## TBBConfig - Integration of Binary Packages -It is a configuration module that is used for integration of prebuilt oneTBB. It consists of two files (TBBConfig.cmake and TBBConfigVersion.cmake) and can be used via [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) function. +It is a configuration module that is used for the integration of prebuilt oneTBB. It consists of two files (``TBBConfig.cmake`` and ``TBBConfigVersion.cmake``) and can be used via the [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) function. -How to use this module in your CMake project: - 1. Let CMake know where to search for TBBConfig, e.g. specify location of TBBConfig.cmake in `TBB_DIR` (for more details about search paths see [find_package](https://cmake.org/cmake/help/latest/command/find_package.html)). 
+To use this module in your CMake project: + 1. Let CMake know where to search for TBBConfig, e.g. specify the location of ``TBBConfig.cmake`` in `TBB_DIR` (for more details about search paths, see [find_package](https://cmake.org/cmake/help/latest/command/find_package.html)). 2. Use [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) to find oneTBB. 3. Use provided variables and/or imported targets (described below) to work with the found oneTBB. @@ -223,14 +242,14 @@ target_link_libraries(foo TBB::tbb) oneTBB components can be passed to [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) after keyword ``COMPONENTS`` or ``REQUIRED``. Use basic names of components (`tbb`, `tbbmalloc`, etc.). -If components are not specified then the default set is used: `tbb`, `tbbmalloc` and ``tbbmalloc_proxy``. +If components are not specified, then the default set is used: `tbb`, `tbbmalloc`, and ``tbbmalloc_proxy``. -If `tbbmalloc_proxy` is requested, `tbbmalloc` component will also be added and set as dependency for `tbbmalloc_proxy`. +If `tbbmalloc_proxy` is requested, the `tbbmalloc` component is also added and set as a dependency for `tbbmalloc_proxy`. TBBConfig creates [imported targets](https://cmake.org/cmake/help/latest/manual/cmake-buildsystem.7.html#imported-targets>) as -shared libraries using the following format: `TBB::` (for example, `TBB::tbb`, `TBB::tbbmalloc`). +shared libraries using the following format: `TBB::`. For example, `TBB::tbb` or `TBB::tbbmalloc`. -Set `TBB_FIND_RELEASE_ONLY` to `TRUE` before calling `find_package` in order to search only for release oneTBB version. This variable helps to avoid simultaneous linkage of release and debug oneTBB versions when CMake configuration is `Debug` but a third-party component depends on release oneTBB version. +To search only for release oneTBB version, set `TBB_FIND_RELEASE_ONLY` to `TRUE` before calling `find_package`. 
This variable helps to avoid simultaneous linkage of release and debug oneTBB versions when CMake configuration is `Debug,` but a third-party component depends on the release oneTBB version. Variables set during TBB configuration: @@ -241,14 +260,14 @@ Variable | Description `TBB_VERSION` | oneTBB version (format: `...`) `TBB_IMPORTED_TARGETS` | All created oneTBB imported targets (not supported for builds from source code) -Starting from [oneTBB 2021.1](https://github.com/oneapi-src/oneTBB/releases/tag/v2021.1) GitHub release TBBConfig files in the binary packages are located under `/lib/cmake/TBB`. +Starting from [oneTBB 2021.1](https://github.com/oneapi-src/oneTBB/releases/tag/v2021.1), GitHub* release TBBConfig files in the binary packages are located under `/lib/cmake/TBB`. For example, `TBB_DIR` should be set to `/lib/cmake/TBB`. TBBConfig files are automatically created during the build from source code and can be installed together with the library. -Also oneTBB provides a helper function that creates TBBConfig files from predefined templates: see `tbb_generate_config` in `cmake/config_generation.cmake`. +Also, oneTBB provides a helper function that creates TBBConfig files from predefined templates. See `tbb_generate_config` in `cmake/config_generation.cmake`. -## oneTBB Python Module support -`TBB4PY_BUILD` Cmake option provides ability to build Python module for oneTBB. +## oneTBB Python Module Support +The `TBB4PY_BUILD` Cmake option provides the ability to build a Python module for oneTBB. ### Targets: - `irml` - build IPC RML server @@ -258,30 +277,30 @@ Also oneTBB provides a helper function that creates TBBConfig files from predefi - Python version 3.5 or newer - SWIG version 3.0.6 or newer -## CMake files +## CMake Files -### Compile/link options +### Compile and Link Options Compile and link options may be specific for certain compilers. This part is handled in `cmake/compilers/*` files. 
Options in TBB CMake are handled via variables in two ways for convenience: * by options group -* by specific option +* by the specific option -#### Options group +#### Options Group -Naming convention is the following: `TBB___`, where +Naming convention is the following: `TBB___`, where: -* `` could be +* `` can be: * `LIB` - options applied during libraries build. * `TEST` - options applied during test build. * `BENCH` - options applied during benchmarks build. * `COMMON` - options applied during all (libraries, test, benchmarks) builds. -* `` could be +* `` can be: * `COMPILE` - options applied during the compilation. * `LINK` - options applied during the linkage. -* `` could be +* `` can be: * `FLAGS` - list of flags * `LIBS` - list of libraries @@ -289,17 +308,17 @@ Naming convention is the following: `TBB___`, where Variable | Description --- | --- -`TBB_COMMON_COMPILE_FLAGS` | Applied to libraries, tests and benchmarks as compile options +`TBB_COMMON_COMPILE_FLAGS` | Applied to libraries, tests, and benchmarks as compile options `TBB_LIB_LINK_FLAGS` | Applied to libraries as link options `TBB_LIB_LINK_LIBS ` | Applied to libraries as link libraries `TBB_TEST_COMPILE_FLAGS` | Applied to tests as compile options -Please specify the `LINK` options prefixed with dash(-) for MSVC(Visual Studio) compiler with CMake < 3.13 to avoid issues caused by `target_link_libraries` CMake command usage. +Specify the `LINK` options prefixed with a dash(-) for MSVC(Visual Studio) compiler with CMake < 3.13 to avoid issues caused by `target_link_libraries` CMake command usage. -#### Specific options +#### Specific Options -If the option used only in part of the places (library, tests, benchmarks) as well as adding this option to the group of other options is not possible, +If the option is used only in part of the places (library, tests, benchmarks) and adding this option to the group of other options is not possible, then the option must be named using common sense. 
-Warnings supperssions should be added into `TBB_WARNING_SUPPRESS` variable which is applied during the compilation of libraries, tests and benchmarks. -Additional warnings should be added into `TBB_WARNING_TEST_FLAGS` variable which is applied during the compilation of tests. +Warning suppressions should be added to the `TBB_WARNING_SUPPRESS` variable, which is applied during the compilation of libraries, tests, and benchmarks. +Additional warnings should be added to the `TBB_WARNING_TEST_FLAGS` variable, which is applied during the compilation of tests. diff --git a/third-party/tbb/cmake/compilers/Clang.cmake b/third-party/tbb/cmake/compilers/Clang.cmake index a128e133..f56b5fba 100644 --- a/third-party/tbb/cmake/compilers/Clang.cmake +++ b/third-party/tbb/cmake/compilers/Clang.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+if (EMSCRIPTEN) + set(TBB_EMSCRIPTEN 1) + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fexceptions) + set(TBB_TEST_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} -fexceptions -sINITIAL_MEMORY=65536000 -sALLOW_MEMORY_GROWTH=1 -sEXIT_RUNTIME=1) + if (NOT EMSCRIPTEN_WITHOUT_PTHREAD) + set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread") + endif() +endif() + if (MINGW) set(TBB_LINK_DEF_FILE_FLAG "") set(TBB_DEF_FILE_PREFIX "") @@ -45,18 +54,23 @@ if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag) endif() # Enable Intel(R) Transactional Synchronization Extensions (-mrtm) and WAITPKG instructions support (-mwaitpkg) on relevant processors -if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)") +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)" AND NOT EMSCRIPTEN) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>:-mwaitpkg>) endif() # Clang flags to prevent compiler from optimizing out security checks -set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security - -fstack-protector-strong -fPIC) -set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now) +set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security -fPIC $<$>:-fstack-protector-strong>) + +# -z switch is not supported on MacOS +if (NOT APPLE) + set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now) +endif() set(TBB_COMMON_LINK_LIBS ${CMAKE_DL_LIBS}) -set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2>) +if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE") + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2>) +endif () if (MINGW) list(APPEND TBB_COMMON_COMPILE_FLAGS -U__STRICT_ANSI__) diff --git a/third-party/tbb/cmake/compilers/GNU.cmake b/third-party/tbb/cmake/compilers/GNU.cmake index b60172c8..6fd8d980 100644 --- a/third-party/tbb/cmake/compilers/GNU.cmake +++ 
b/third-party/tbb/cmake/compilers/GNU.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -40,9 +40,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>,$>>:-mwaitpkg>) endif() -if (NOT MINGW) - set(TBB_COMMON_LINK_LIBS dl) -endif() +set(TBB_COMMON_LINK_LIBS ${CMAKE_DL_LIBS}) # Ignore -Werror set through add_compile_options() or added to CMAKE_CXX_FLAGS if TBB_STRICT is disabled. if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag) @@ -73,9 +71,13 @@ endif () set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security -fstack-protector-strong ) -set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now,-z,noexecstack) -set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2> ) - +# -z switch is not supported on MacOS and MinGW +if (NOT APPLE AND NOT MINGW) + set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now,-z,noexecstack) +endif() +if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE") + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2> ) +endif () # TBB malloc settings set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions) diff --git a/third-party/tbb/cmake/compilers/Intel.cmake b/third-party/tbb/cmake/compilers/Intel.cmake index 582f9a84..531e078e 100644 --- a/third-party/tbb/cmake/compilers/Intel.cmake +++ b/third-party/tbb/cmake/compilers/Intel.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use 
this file except in compliance with the License. @@ -21,7 +21,11 @@ if (MSVC) elseif (APPLE) include(${CMAKE_CURRENT_LIST_DIR}/AppleClang.cmake) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fstack-protector -Wformat -Wformat-security - $<$>:-fno-omit-frame-pointer -qno-opt-report-embed -D_FORTIFY_SOURCE=2>) + $<$>:-fno-omit-frame-pointer -qno-opt-report-embed>) + if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE") + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2>) + endif () + set(TBB_OPENMP_FLAG -qopenmp) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) else() diff --git a/third-party/tbb/cmake/compilers/IntelLLVM.cmake b/third-party/tbb/cmake/compilers/IntelLLVM.cmake index 89d56ae6..a9ebb3e6 100644 --- a/third-party/tbb/cmake/compilers/IntelLLVM.cmake +++ b/third-party/tbb/cmake/compilers/IntelLLVM.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,12 @@ if (WIN32) include(${CMAKE_CURRENT_LIST_DIR}/MSVC.cmake) + set(TBB_OPENMP_FLAG /Qopenmp) set(TBB_IPO_COMPILE_FLAGS $<$>:/Qipo>) set(TBB_IPO_LINK_FLAGS $<$>:/INCREMENTAL:NO>) else() include(${CMAKE_CURRENT_LIST_DIR}/Clang.cmake) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) + set(TBB_OPENMP_FLAG -qopenmp) endif() set(TBB_IPO_LINK_FLAGS ${TBB_IPO_LINK_FLAGS} ${TBB_IPO_COMPILE_FLAGS}) diff --git a/third-party/tbb/cmake/compilers/MSVC.cmake b/third-party/tbb/cmake/compilers/MSVC.cmake index 0e0dfd31..6568ec7e 100644 --- a/third-party/tbb/cmake/compilers/MSVC.cmake +++ b/third-party/tbb/cmake/compilers/MSVC.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -33,9 +33,9 @@ if (MSVC_VERSION LESS_EQUAL 1900) set(TBB_TEST_COMPILE_FLAGS ${TBB_TEST_COMPILE_FLAGS} /wd4503) endif() set(TBB_LIB_COMPILE_FLAGS -D_CRT_SECURE_NO_WARNINGS /GS) -set(TBB_COMMON_COMPILE_FLAGS /volatile:iso /FS /EHsc) +set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /volatile:iso /FS /EHsc) -set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /DYNAMICBASE /NXCOMPAT) +set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /DEPENDENTLOADFLAG:0x2000 /DYNAMICBASE /NXCOMPAT) if (TBB_ARCH EQUAL 32) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /SAFESEH ) diff --git a/third-party/tbb/cmake/config_generation.cmake b/third-party/tbb/cmake/config_generation.cmake index 0cbdd745..e4ef7bce 100644 --- a/third-party/tbb/cmake/config_generation.cmake +++ b/third-party/tbb/cmake/config_generation.cmake @@ -92,6 +92,7 @@ set(_tbbbind_bin_version ${tbb_gen_cfg_TBBBIND_BINARY_VERSION}) NAMES \${_tbb_component}\${_bin_version}.dll PATHS \${_tbb_root} PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\" + NO_DEFAULT_PATH ) if (EXISTS \"\${_tbb_debug_lib}\") @@ -99,6 +100,7 @@ set(_tbbbind_bin_version ${tbb_gen_cfg_TBBBIND_BINARY_VERSION}) NAMES \${_tbb_component}\${_bin_version}_debug.dll PATHS \${_tbb_root} PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\" + NO_DEFAULT_PATH ) endif() ") diff --git a/third-party/tbb/cmake/hwloc_detection.cmake b/third-party/tbb/cmake/hwloc_detection.cmake index 47233b17..aaca5a59 100644 --- a/third-party/tbb/cmake/hwloc_detection.cmake +++ b/third-party/tbb/cmake/hwloc_detection.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -46,8 +46,6 @@ endforeach() unset(HWLOC_TARGET_NAME) if (NOT HWLOC_TARGET_EXPLICITLY_DEFINED AND - # No hwloc auto detection for cross compilation - NOT CMAKE_CROSSCOMPILING AND NOT TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH ) find_package(PkgConfig QUIET) diff --git a/third-party/tbb/cmake/resumable_tasks.cmake b/third-party/tbb/cmake/resumable_tasks.cmake new file mode 100644 index 00000000..d379d4ed --- /dev/null +++ b/third-party/tbb/cmake/resumable_tasks.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +include(CheckSymbolExists) + +if (UNIX) + set(CMAKE_REQUIRED_FLAGS -Wno-deprecated-declarations) + if (APPLE) + set(CMAKE_REQUIRED_DEFINITIONS -D_XOPEN_SOURCE) + endif() + + check_symbol_exists("getcontext" "ucontext.h" _tbb_have_ucontext) + if (NOT _tbb_have_ucontext) + set(TBB_RESUMABLE_TASKS_USE_THREADS "__TBB_RESUMABLE_TASKS_USE_THREADS=1") + endif() + + unset(_tbb_have_ucontext) + unset(CMAKE_REQUIRED_DEFINITIONS) + unset(CMAKE_REQUIRED_FLAGS) +endif() diff --git a/third-party/tbb/cmake/templates/TBBConfig.cmake.in b/third-party/tbb/cmake/templates/TBBConfig.cmake.in index 18ac68d3..3131e3dd 100644 --- a/third-party/tbb/cmake/templates/TBBConfig.cmake.in +++ b/third-party/tbb/cmake/templates/TBBConfig.cmake.in @@ -65,6 +65,7 @@ foreach (_tbb_component ${TBB_FIND_COMPONENTS}) NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}.@TBB_LIB_EXT@ PATHS ${_tbb_root} PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@" + NO_DEFAULT_PATH ) if (NOT TBB_FIND_RELEASE_ONLY) @@ -72,6 +73,7 @@ foreach (_tbb_component ${TBB_FIND_COMPONENTS}) NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}_debug.@TBB_LIB_EXT@ PATHS ${_tbb_root} PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@" + NO_DEFAULT_PATH ) endif() diff --git a/third-party/tbb/cmake/utils.cmake b/third-party/tbb/cmake/utils.cmake index 254fe11e..21101989 100644 --- a/third-party/tbb/cmake/utils.cmake +++ b/third-party/tbb/cmake/utils.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,25 +23,37 @@ macro(tbb_remove_compile_flag flag) endmacro() macro(tbb_install_target target) - install(TARGETS ${target} - EXPORT TBBTargets - LIBRARY - DESTINATION ${CMAKE_INSTALL_LIBDIR} - NAMELINK_SKIP - COMPONENT runtime - RUNTIME - DESTINATION ${CMAKE_INSTALL_BINDIR} - COMPONENT runtime - ARCHIVE - DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT devel) - - if (BUILD_SHARED_LIBS) + if (TBB_INSTALL) install(TARGETS ${target} + EXPORT TBBTargets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - NAMELINK_ONLY - COMPONENT devel) + NAMELINK_SKIP + COMPONENT runtime + RUNTIME + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT runtime + ARCHIVE + DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT devel + FRAMEWORK + DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT runtime + OPTIONAL) + + if (BUILD_SHARED_LIBS) + install(TARGETS ${target} + LIBRARY + DESTINATION ${CMAKE_INSTALL_LIBDIR} + NAMELINK_ONLY + COMPONENT devel) + endif() + if (MSVC AND BUILD_SHARED_LIBS) + install(FILES $ + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT devel + OPTIONAL) + endif() endif() endmacro() diff --git a/third-party/tbb/cmake/vars_utils.cmake b/third-party/tbb/cmake/vars_utils.cmake index 989fea26..54a9fda1 100644 --- a/third-party/tbb/cmake/vars_utils.cmake +++ b/third-party/tbb/cmake/vars_utils.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,12 +26,20 @@ get_filename_component(TBB_VARS_TEMPLATE_NAME ${PROJECT_SOURCE_DIR}/integration/ string(REPLACE ".in" "" TBB_VARS_NAME ${TBB_VARS_TEMPLATE_NAME}) macro(tbb_gen_vars target) + if (NOT TBB_BUILD_APPLE_FRAMEWORKS) + set(BIN_PATH $) + else() + # For Apple* frameworks, the binaries are placed in a framework bundle. 
+ # When using an Apple* framework, you refer to the bundle, not the binary inside, so we take the bundle's path and go up one level. + # This path will then be used to generate the vars file, and the contents of the vars file will use the bundle's parent directory. + set(BIN_PATH $/..) + endif() if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) add_custom_command(TARGET ${target} POST_BUILD COMMAND ${CMAKE_COMMAND} -DBINARY_DIR=${CMAKE_BINARY_DIR} -DSOURCE_DIR=${PROJECT_SOURCE_DIR} - -DBIN_PATH=$ + -DBIN_PATH=${BIN_PATH} -DVARS_TEMPLATE=${TBB_VARS_TEMPLATE} -DVARS_NAME=${TBB_VARS_NAME} -DTBB_INSTALL_VARS=${TBB_INSTALL_VARS} diff --git a/third-party/tbb/doc/GSG/next_steps.rst b/third-party/tbb/doc/GSG/next_steps.rst index 4974265d..aeb4407b 100644 --- a/third-party/tbb/doc/GSG/next_steps.rst +++ b/third-party/tbb/doc/GSG/next_steps.rst @@ -17,6 +17,12 @@ After installing |short_name|, set the environment variables: * On Linux* OS: ``vars.{sh|csh} in /tbb/latest/env`` * On Windows* OS: ``vars.bat in /tbb/latest/env`` +.. tip:: + + oneTBB can coordinate with Intel(R) OpenMP on CPU resources usage + to avoid excessive oversubscription when both runtimes are used within a process. + To enable this feature set up ``TCM_ENABLE`` environment variable to ``1``. + Build and Run a Sample ********************** diff --git a/third-party/tbb/doc/conf.py b/third-party/tbb/doc/conf.py index 87593ebf..19da0a4c 100644 --- a/third-party/tbb/doc/conf.py +++ b/third-party/tbb/doc/conf.py @@ -137,10 +137,14 @@ 'use_issues_button': True, 'use_edit_page_button': True, 'repository_branch': 'master', - 'extra_footer': '

Cookies

' } +if BUILD_TYPE != 'oneapi' and BUILD_TYPE != 'dita': + html_theme_options = { + "extra_footer": "
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.

oneTBB is licensed under Apache License Version 2.0. Refer to the LICENSE file for the full license text and copyright notice.
" + } + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". diff --git a/third-party/tbb/doc/index/toctree.rst b/third-party/tbb/doc/index/toctree.rst index eda4497e..fba9aee4 100644 --- a/third-party/tbb/doc/index/toctree.rst +++ b/third-party/tbb/doc/index/toctree.rst @@ -9,6 +9,7 @@ /main/intro/intro_os /main/intro/Benefits /main/intro/testing_approach + /main/intro/limitations.rst .. toctree:: diff --git a/third-party/tbb/doc/main/intro/Benefits.rst b/third-party/tbb/doc/main/intro/Benefits.rst index b66ea5d1..5058cc71 100644 --- a/third-party/tbb/doc/main/intro/Benefits.rst +++ b/third-party/tbb/doc/main/intro/Benefits.rst @@ -20,7 +20,7 @@ it with any compiler supporting ISO C++. The library differs from typical threading packages in the following ways: -- **oneTBB enables you to specify logical paralleism instead of +- **oneTBB enables you to specify logical parallelism instead of threads**. Most threading packages require you to specify threads. Programming directly in terms of threads can be tedious and lead to inefficient programs, because threads are low-level, heavy constructs diff --git a/third-party/tbb/doc/main/intro/limitations.rst b/third-party/tbb/doc/main/intro/limitations.rst new file mode 100644 index 00000000..dde9f772 --- /dev/null +++ b/third-party/tbb/doc/main/intro/limitations.rst @@ -0,0 +1,46 @@ +.. _limitations: + +Known Limitations +***************** + +This page outlines the known limitations of oneTBB to help you better understand its capabilities. + +Freestanding Compilation Mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Limitation:** oneTBB does not support the freestanding compilation mode. 
+ +**Risk:** Compiling an application that utilizes oneTBB headers using the Intel(R) oneAPI DPC+/C+ Compiler may result in failure on Windows* OS if the ``/Qfreestanding`` compiler option is employed. + +Static Assert +^^^^^^^^^^^^^ + +**Limitation:** A static assert causes the compilation failures in oneTBB headers if the following conditions are satisfied: + + * Compilation is done with Clang 12.0.0 or a more recent version. + * The LLVM standard library is employed, coupled with the use of the ``-ffreestanding`` flag and C++11/14 compiler options. + +**Risk:** The compilation failures. + +Interface Incompatibilities: TBB vs oneTBB +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Limitation:** An application using Parallel STL algorithms in the ``libstdc++`` versions 9 and 10 may fail to compile due to incompatible interface changes between earlier versions of Threading Building Blocks (TBB) and oneAPI Threading Building Blocks (oneTBB). + +**Solution:** Disable support for Parallel STL algorithms by defining ``PSTL_USE_PARALLEL_POLICIES`` (in libstdc++ 9) or ``_GLIBCXX_USE_TBB_PAR_BACKEND`` (in libstdc++ 10) macro to zero before inclusion of the first standard header file in each translation unit. + +Incorrect Installation Location +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Limitation:** On Linux* OS, if oneAPI Threading Building Blocks (oneTBB) or Threading Building Blocks (TBB) are installed in a system folder, such as ``/usr/lib64``, the application may fail to link due to the order in which the linker searches for libraries. + +**Risk:** The issue does not affect the program execution. + +**Solution:** Use the ``-L`` linker option to specify the correct location of oneTBB library. + +``fork()`` Support +^^^^^^^^^^^^^^^^^^^ + +**Limitation:** oneTBB does not support ``fork()``. + +**Solution:** To work-around the issue, consider using ``task_scheduler_handle`` to join oneTBB worker threads before using ``fork()``. 
diff --git a/third-party/tbb/doc/main/reference/reference.rst b/third-party/tbb/doc/main/reference/reference.rst index ec9fb1e1..833a50ee 100644 --- a/third-party/tbb/doc/main/reference/reference.rst +++ b/third-party/tbb/doc/main/reference/reference.rst @@ -19,6 +19,7 @@ It also describes features that are not included in the oneTBB specification. parallel_for_each_semantics parallel_sort_ranges_extension scalable_memory_pools/malloc_replacement_log + rvalue_reduce Preview features **************** diff --git a/third-party/tbb/doc/main/reference/rvalue_reduce.rst b/third-party/tbb/doc/main/reference/rvalue_reduce.rst new file mode 100644 index 00000000..53880952 --- /dev/null +++ b/third-party/tbb/doc/main/reference/rvalue_reduce.rst @@ -0,0 +1,89 @@ +.. _rvalue_reduce: + +Parallel Reduction for rvalues +============================== + +.. contents:: + :local: + :depth: 1 + +Description +*********** + +|full_name| implementation extends the `ParallelReduceFunc `_ and +`ParallelReduceReduction `_ +to optimize operating with ``rvalues`` using functional form of ``tbb::parallel_reduce`` and ``tbb::parallel_deterministic_reduce`` algorithms. + +API +*** + +Header +------ + +.. code:: cpp + + #include + +ParallelReduceFunc Requirements: Pseudo-Signature, Semantics +------------------------------------------------------------ + +.. cpp:function:: Value Func::operator()(const Range& range, Value&& x) const + +or + +.. cpp:function:: Value Func::operator()(const Range& range, const Value& x) const + + Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the `Range requirements _`. + The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. + + If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred. + +ParallelReduceReduction Requirements: Pseudo-Signature, Semantics +----------------------------------------------------------------- + +.. 
cpp:function:: Value Reduction::operator()(Value&& x, Value&& y) const + +or + +.. cpp:function:: Value Reduction::operator()(const Value& x, const Value& y) const + + Combines the ``x`` and ``y`` results. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. + + If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred. + +Example +******* + +.. code:: cpp + // C++17 + #include + #include + #include + #include + + int main() { + std::vector> sets = ...; + + oneapi::tbb::parallel_reduce(oneapi::tbb::blocked_range(0, sets.size()), + std::set{}, // identity element - empty set + [&](const oneapi::tbb::blocked_range& range, std::set&& value) { + for (size_t i = range.begin(); i < range.end(); ++i) { + // Having value as a non-const rvalue reference allows to efficiently + // transfer nodes from sets[i] without copying/moving the data + value.merge(std::move(sets[i])); + } + return value; + }, + [&](std::set&& x, std::set&& y) { + x.merge(std::move(y)); + return x; + } + ); + } + +.. rubric:: See also + +* `oneapi::tbb::parallel_reduce specification `_ +* `oneapi::tbb::parallel_deterministic_reduce specification `_ +* `ParallelReduceFunc specification `_ +* `ParallelReduceReduction specification `_ diff --git a/third-party/tbb/doc/main/tbb_userguide/Edges.rst b/third-party/tbb/doc/main/tbb_userguide/Edges.rst index 5cdaa895..ea4c214b 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Edges.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Edges.rst @@ -37,8 +37,8 @@ it and then connect that to the first node with an edge. g.wait_for_all(); -Now there are two ``function_node``s, ``n`` and ``m``. The call to ``make_edge`` creates -an edge from ``n`` to ``m``. The node n is created with unlimited concurrency, +Now there are two ``function_node`` ``s``, ``n`` and ``m``. The call to ``make_edge`` creates +an edge from ``n`` to ``m``. 
The node ``n`` is created with unlimited concurrency, while ``m`` has a concurrency limit of 1. The invocations of ``n`` can all proceed in parallel, while the invocations of ``m`` will be serialized. Because there is an edge from ``n`` to ``m``, each value ``v``, returned by ``n``, will diff --git a/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst b/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst index 8487c449..44fc2f0a 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst @@ -63,7 +63,7 @@ messages and do not support ``try_get()`` or ``try_reserve()``. broadcast_node bn(g); buffer_node buf1(g); buffer_node buf2(g); - typedef join_node reserving> join_type; + typedef join_node, reserving> join_type; join_type jn(g); buffer_node buf_out(g); join_type::output_type tuple_out; @@ -71,9 +71,9 @@ messages and do not support ``try_get()`` or ``try_reserve()``. // join_node predecessors are both reservable buffer_nodes - make_edge(buf1,input_port<0>jn)); - make_edge(bn,input_port<0>jn)); // attach a broadcast_node - make_edge(buf2,input_port<1>jn)); + make_edge(buf1,input_port<0>(jn)); + make_edge(bn,input_port<0>(jn)); // attach a broadcast_node + make_edge(buf2,input_port<1>(jn)); make_edge(jn, buf_out); bn.try_put(2); buf1.try_put(3); @@ -81,7 +81,7 @@ messages and do not support ``try_get()`` or ``try_reserve()``. 
buf2.try_put(7); g.wait_for_all(); while (buf_out.try_get(tuple_out)) { - printf("join_node output == (%d,%d)\n",get<0>tuple_out), get<1>tuple_out) ); + printf("join_node output == (%d,%d)\n",get<0>(tuple_out), get<1>(tuple_out) ); } if(buf1.try_get(icnt)) printf("buf1 had %d\n", icnt); else printf("buf1 was empty\n"); diff --git a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst index 57582aac..8d467fb6 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst @@ -46,3 +46,4 @@ TBB possible output: TBB: RML private TBB: Tools support disabled +.. note:: The ``tbbmalloc`` library in oneTBB is fully binary compatible with TBB. diff --git a/third-party/tbb/doc/main/tbb_userguide/Package_Contents.rst b/third-party/tbb/doc/main/tbb_userguide/Package_Contents_os.rst similarity index 93% rename from third-party/tbb/doc/main/tbb_userguide/Package_Contents.rst rename to third-party/tbb/doc/main/tbb_userguide/Package_Contents_os.rst index 30d75c09..2d9e0a2f 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Package_Contents.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Package_Contents_os.rst @@ -14,4 +14,5 @@ and macOS\* operating systems as described in this section. 
../tbb_userguide/Scalable_Memory_Allocator ../tbb_userguide/Windows_OS_ug ../tbb_userguide/Linux_OS - ../tbb_userguide/Mac_OS \ No newline at end of file + ../tbb_userguide/Mac_OS + diff --git a/third-party/tbb/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst b/third-party/tbb/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst index 05786fbd..15299b51 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst @@ -25,7 +25,7 @@ pipeline. .. CAUTION:: Since the body object provided to the filters of the - ``parallel_pipline`` might be copied, its ``operator()`` should not + ``parallel_pipeline`` might be copied, its ``operator()`` should not modify the body. Otherwise the modification might or might not become visible to the thread that invoked ``parallel_pipeline``, depending upon whether ``operator()`` is acting on the original or a copy. As a diff --git a/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst b/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst index cd8482ff..8d9ba3a1 100644 --- a/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst +++ b/third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst @@ -30,14 +30,14 @@ string occurs in the array ``Data``. // Structure that defines hashing and comparison operations for user's type. struct MyHashCompare { - static size_t hash( const string& x ) { + size_t hash( const string& x ) const { size_t h = 0; for( const char* s = x.c_str(); *s; ++s ) h = (h*17)^*s; return h; } //! True if strings are equal - static bool equal( const string& x, const string& y ) { + bool equal( const string& x, const string& y ) const { return x==y; } }; @@ -128,4 +128,4 @@ any other extant accesses on ``key``. .. 
toctree:: :maxdepth: 4 - ../tbb_userguide/More_on_HashCompare \ No newline at end of file + ../tbb_userguide/More_on_HashCompare diff --git a/third-party/tbb/doc/main/tbb_userguide/title.rst b/third-party/tbb/doc/main/tbb_userguide/title.rst index c073acfc..8adb7093 100644 --- a/third-party/tbb/doc/main/tbb_userguide/title.rst +++ b/third-party/tbb/doc/main/tbb_userguide/title.rst @@ -8,7 +8,7 @@ .. toctree:: :maxdepth: 4 - ../tbb_userguide/Package_Contents + ../tbb_userguide/Package_Contents_os ../tbb_userguide/Parallelizing_Simple_Loops_os ../tbb_userguide/Parallelizing_Complex_Loops ../tbb_userguide/Flow_Graph diff --git a/third-party/tbb/doc/make.bat b/third-party/tbb/doc/make.bat index 557ecc5b..14d399a5 100644 --- a/third-party/tbb/doc/make.bat +++ b/third-party/tbb/doc/make.bat @@ -25,7 +25,7 @@ REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) -set SOURCEDIR=doc +set SOURCEDIR=. set BUILDDIR=build if "%1" == "" goto help diff --git a/third-party/tbb/examples/CMakeLists.txt b/third-party/tbb/examples/CMakeLists.txt index 43877e42..16f1c455 100644 --- a/third-party/tbb/examples/CMakeLists.txt +++ b/third-party/tbb/examples/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(tbb_examples CXX) @@ -66,6 +66,7 @@ tbb_add_example(parallel_for_each parallel_preorder) tbb_add_example(parallel_pipeline square) tbb_add_example(parallel_reduce convex_hull) +tbb_add_example(parallel_reduce pi) tbb_add_example(parallel_reduce primes) tbb_add_example(task_arena fractal) @@ -73,3 +74,5 @@ tbb_add_example(task_arena fractal) tbb_add_example(task_group sudoku) tbb_add_example(test_all fibonacci) + +tbb_add_example(migration recursive_fibonacci) diff --git a/third-party/tbb/examples/README.md b/third-party/tbb/examples/README.md index 318d2d93..037ca4d4 100644 --- a/third-party/tbb/examples/README.md +++ b/third-party/tbb/examples/README.md @@ -19,6 +19,7 @@ This directory contains example usages of oneAPI Threading Building Blocks. | parallel_for_each/parallel_preorder | Parallel preorder traversal of a graph. | parallel_pipeline/square | Another string transformation example that squares numbers read from a file. | parallel_reduce/convex_hull | Parallel version of convex hull algorithm (quick hull). +| parallel_reduce/pi | Parallel version of calculating π by numerical integration. | parallel_reduce/primes | Parallel version of the Sieve of Eratosthenes. | task_arena/fractal |The example calculates two classical Mandelbrot fractals with different concurrency limits. | task_group/sudoku | Compute all solutions for a Sudoku board. diff --git a/third-party/tbb/examples/common/gui/CMakeLists.txt b/third-party/tbb/examples/common/gui/CMakeLists.txt index 8bee0a83..ea8b0060 100644 --- a/third-party/tbb/examples/common/gui/CMakeLists.txt +++ b/third-party/tbb/examples/common/gui/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) set(EXAMPLES_UI_MODE "con" CACHE STRING "EXAMPLES_UI_MODE") diff --git a/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt b/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt index 14d25fa7..77efd2f6 100644 --- a/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt +++ b/third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(count_strings CXX) diff --git a/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp b/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp index 2b563cd5..0a230846 100644 --- a/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp +++ b/third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ class hash> { (sizeof(std::size_t) == sizeof(unsigned)) ? 
2654435769U : 11400714819323198485ULL); std::hash char_hash; -}; // strunt hash +}; // struct hash } // namespace std diff --git a/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt b/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt index 8a6d78a0..624a5928 100644 --- a/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt +++ b/third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(shortpath CXX) diff --git a/third-party/tbb/examples/getting_started/README.md b/third-party/tbb/examples/getting_started/README.md index 4ad29f06..def9429a 100644 --- a/third-party/tbb/examples/getting_started/README.md +++ b/third-party/tbb/examples/getting_started/README.md @@ -1,5 +1,5 @@ # Code Samples of oneAPI Threading Building Blocks (oneTBB) -This directory contains the examples referenced by the [oneAPI Threading Building Blocks Get Started Guide](https://software.intel.com/content/www/us/en/develop/documentation/get-started-with-onetbb/top.html) +This directory contains the examples referenced by the [oneAPI Threading Building Blocks Get Started Guide](https://www.intel.com/content/www/us/en/docs/onetbb/get-started-guide/current/overview.html) | Code sample name | Description |:--- |:--- diff --git a/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt b/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt index cf4e6a1b..91792dde 100644 --- 
a/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt +++ b/third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(sub_string_finder_simple CXX) project(sub_string_finder_extended CXX) diff --git a/third-party/tbb/examples/getting_started/sub_string_finder/README.md b/third-party/tbb/examples/getting_started/sub_string_finder/README.md index 0c32f223..f57a92ac 100644 --- a/third-party/tbb/examples/getting_started/sub_string_finder/README.md +++ b/third-party/tbb/examples/getting_started/sub_string_finder/README.md @@ -1,5 +1,5 @@ # Sub_string_finder sample -An example that uses the `parallel_for` template in a substring matching program. The [oneAPI Threading Building Blocks [](https://software.intel.com/content/www/us/en/develop/documentation/get-started-with-onetbb/top.html) describes this example. +An example that uses the `parallel_for` template in a substring matching program. The [oneAPI Threading Building Blocks Get Started Guide](https://www.intel.com/content/www/us/en/docs/onetbb/get-started-guide/current/overview.html) describes this example. For each position in a string, the program displays the length of the largest matching substring elsewhere in the string. The program also displays the location of a largest match for each position. Consider the string "babba" as an example. Starting at position 0, "ba" is the largest substring with a match elsewhere in the string (position 3). 
diff --git a/third-party/tbb/examples/graph/binpack/CMakeLists.txt b/third-party/tbb/examples/graph/binpack/CMakeLists.txt index 5fc979a5..3d3b7921 100644 --- a/third-party/tbb/examples/graph/binpack/CMakeLists.txt +++ b/third-party/tbb/examples/graph/binpack/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(binpack CXX) diff --git a/third-party/tbb/examples/graph/cholesky/CMakeLists.txt b/third-party/tbb/examples/graph/cholesky/CMakeLists.txt index eeb2649a..2e8273ae 100644 --- a/third-party/tbb/examples/graph/cholesky/CMakeLists.txt +++ b/third-party/tbb/examples/graph/cholesky/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(cholesky CXX) diff --git a/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt b/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt index 95f7a483..d46af59b 100644 --- a/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt +++ b/third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(dining_philosophers CXX) diff --git a/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt b/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt index 9a53a1d0..7a9142a5 100644 --- a/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt +++ b/third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(fgbzip2 CXX) @@ -38,7 +38,7 @@ if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM) target_compile_options(fgbzip2 PRIVATE -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE) endif() -if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL Intel) +if (MSVC AND (CMAKE_CXX_COMPILER_ID STREQUAL Intel OR CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM)) if (COMMAND target_link_options) target_link_options(fgbzip2 PRIVATE /FORCE:MULTIPLE /INCREMENTAL:NO) else() diff --git a/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt b/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt index b33f9156..99e1cc8f 100644 --- a/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt +++ b/third-party/tbb/examples/graph/logic_sim/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(logic_sim CXX) diff --git a/third-party/tbb/examples/graph/som/CMakeLists.txt b/third-party/tbb/examples/graph/som/CMakeLists.txt index 6e759331..c2dd1a80 100644 --- a/third-party/tbb/examples/graph/som/CMakeLists.txt +++ b/third-party/tbb/examples/graph/som/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) include(../../common/cmake/common.cmake) project(som CXX) diff --git a/third-party/tbb/examples/migration/README.md b/third-party/tbb/examples/migration/README.md new file mode 100644 index 00000000..7bfca1b6 --- /dev/null +++ b/third-party/tbb/examples/migration/README.md @@ -0,0 +1,6 @@ +# Code Samples of oneAPI Threading Building Blocks (oneTBB) +Examples of migrating from TBB APIs to the oneTBB APIs. + +| Code sample name | Description +|:--- |:--- +| recursive_fibonacci | Compute Fibonacci number in recursive way. diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt b/third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt new file mode 100644 index 00000000..57e027cf --- /dev/null +++ b/third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.5) + +project(recursive_fibonacci CXX) + +include(../../common/cmake/common.cmake) + +set_common_project_settings(tbb) + +add_executable(recursive_fibonacci fibonacci.cpp) +target_link_libraries(recursive_fibonacci + TBB::tbb + Threads::Threads + $<$:rt>) # Link "rt" library on Linux +target_compile_options(recursive_fibonacci PRIVATE ${TBB_CXX_STD_FLAG}) + +set(EXECUTABLE "$") + +# Parameters of executable N C I: +# `N` - specifies the fibonacci number which would be calculated. +# `C` - cutoff that will be used to stop recursive split. +# `I` - number of iteration to measure benchmark time. +set(ARGS 30 16 20 1) +set(PERF_ARGS 50 5 20) + +add_execution_target(run_recursive_fibonacci recursive_fibonacci ${EXECUTABLE} "${ARGS}") +add_execution_target(perf_run_recursive_fibonacci recursive_fibonacci ${EXECUTABLE} "${PERF_ARGS}") diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/README.md b/third-party/tbb/examples/migration/recursive_fibonacci/README.md new file mode 100644 index 00000000..1f0341c1 --- /dev/null +++ b/third-party/tbb/examples/migration/recursive_fibonacci/README.md @@ -0,0 +1,23 @@ +# Fibonacci sample +This directory contains an example that computes Fibonacci numbers using emulation for TBB Task API. + +## Building the example +``` +cmake +cmake --build . +``` + +## Running the sample +### Predefined make targets +* `make run_recursive_fibonacci` - executes the example with predefined parameters (extended testing enabled). +* `make perf_run_recursive_fibonacci` - executes the example with suggested parameters to measure the oneTBB performance. + +### Application parameters +Usage: +``` +recursive_fibonacci N C I T +``` +* `N` - specifies the fibonacci number which would be calculated. +* `C` - cutoff that will be used to stop recursive split. +* `I` - number of iteration to measure benchmark time. +* `T` - enables extended testing (recycle task in a loop). 
diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp new file mode 100644 index 00000000..e4a7c12e --- /dev/null +++ b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp @@ -0,0 +1,61 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "fibonacci_single_task.h" +#include "fibonacci_two_tasks.h" + +#include +#include +#include + +int cutoff; +bool testing_enabled; + +template +std::pair measure(F&& f, + int number, + unsigned long ntrial) { + std::vector times; + + unsigned long result; + for (unsigned long i = 0; i < ntrial; ++i) { + auto t1 = std::chrono::steady_clock::now(); + result = f(number); + auto t2 = std::chrono::steady_clock::now(); + + auto time = std::chrono::duration_cast(t2 - t1).count(); + times.push_back(time); + } + + return std::make_pair( + result, + static_cast(std::accumulate(times.begin(), times.end(), 0) / times.size())); +} + +int main(int argc, char* argv[]) { + int numbers = argc > 1 ? strtol(argv[1], nullptr, 0) : 50; + cutoff = argc > 2 ? strtol(argv[2], nullptr, 0) : 16; + unsigned long ntrial = argc > 3 ? (unsigned long)strtoul(argv[3], nullptr, 0) : 20; + testing_enabled = argc > 4 ? 
(bool)strtol(argv[4], nullptr, 0) : false; + + auto res = measure(fibonacci_two_tasks, numbers, ntrial); + std::cout << "Fibonacci two tasks impl N = " << res.first << " Avg time = " << res.second + << " ms" << std::endl; + + res = measure(fibonacci_single_task, numbers, ntrial); + std::cout << "Fibonacci single task impl N = " << res.first << " Avg time = " << res.second + << " ms" << std::endl; +} diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h new file mode 100644 index 00000000..dae8895b --- /dev/null +++ b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h @@ -0,0 +1,97 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef SINGLE_TASK_HEADER +#define SINGLE_TASK_HEADER + +#include "task_emulation_layer.h" + +#include +#include +#include + +extern int cutoff; +extern bool testing_enabled; + +long serial_fib_1(int n) { + return n < 2 ? 
n : serial_fib_1(n - 1) + serial_fib_1(n - 2); +} + +struct single_fib_task : task_emulation::base_task { + enum class state { + compute, + sum + }; + + single_fib_task(int n, int* x) : n(n), x(x), s(state::compute) + {} + + task_emulation::base_task* execute() override { + task_emulation::base_task* bypass = nullptr; + switch (s) { + case state::compute : { + bypass = compute_impl(); + break; + } + case state::sum : { + *x = x_l + x_r; + + if (testing_enabled) { + if (n == cutoff && num_recycles > 0) { + --num_recycles; + bypass = compute_impl(); + } + } + + break; + } + } + return bypass; + } + + task_emulation::base_task* compute_impl() { + task_emulation::base_task* bypass = nullptr; + if (n < cutoff) { + *x = serial_fib_1(n); + } + else { + bypass = this->allocate_child_and_increment(n - 2, &x_r); + task_emulation::run_task(this->allocate_child_and_increment(n - 1, &x_l)); + + // Recycling + this->s = state::sum; + this->recycle_as_continuation(); + } + return bypass; + } + + + int n; + int* x; + state s; + + int x_l{ 0 }, x_r{ 0 }; + int num_recycles{5}; +}; + +int fibonacci_single_task(int n) { + int sum{}; + tbb::task_group tg; + task_emulation::run_and_wait(tg, task_emulation::allocate_root_task(/* for root task = */ tg, n, &sum)); + return sum; +} + +#endif // SINGLE_TASK_HEADER diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h new file mode 100644 index 00000000..5d7fd022 --- /dev/null +++ b/third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h @@ -0,0 +1,79 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef TWO_TASKS_HEADER +#define TWO_TASKS_HEADER + +#include "task_emulation_layer.h" + +#include +#include +#include +#include + +extern int cutoff; + +long serial_fib(int n) { + return n < 2 ? n : serial_fib(n - 1) + serial_fib(n - 2); +} + +struct fib_continuation : task_emulation::base_task { + fib_continuation(int& s) : sum(s) {} + + task_emulation::base_task* execute() override { + sum = x + y; + return nullptr; + } + + int x{ 0 }, y{ 0 }; + int& sum; +}; + +struct fib_computation : task_emulation::base_task { + fib_computation(int n, int* x) : n(n), x(x) {} + + task_emulation::base_task* execute() override { + task_emulation::base_task* bypass = nullptr; + if (n < cutoff) { + *x = serial_fib(n); + } + else { + // Continuation passing + auto& c = *this->allocate_continuation(/* children_counter = */ 2, *x); + task_emulation::run_task(c.create_child(n - 1, &c.x)); + + // Recycling + this->recycle_as_child_of(c); + n = n - 2; + x = &c.y; + bypass = this; + } + return bypass; + } + + int n; + int* x; +}; + +int fibonacci_two_tasks(int n) { + int sum{}; + tbb::task_group tg; + tg.run_and_wait( + task_emulation::create_root_task(/* for root task = */ tg, n, &sum)); + return sum; +} + +#endif // TWO_TASKS_HEADER diff --git a/third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h b/third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h new file mode 100644 index 00000000..7252d447 --- /dev/null +++ b/third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h @@ -0,0 +1,225 @@ +/* + Copyright 
(c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_task_emulation_layer_H +#define __TBB_task_emulation_layer_H + +#include "tbb/task_group.h" +#include "tbb/task_arena.h" + +#include + +namespace task_emulation { + +struct task_group_pool { + task_group_pool() : pool_size(std::thread::hardware_concurrency()), task_submitters(new tbb::task_group[pool_size]) {} + + ~task_group_pool() { + for (std::size_t i = 0; i < pool_size; ++i) { + task_submitters[i].wait(); + } + + delete [] task_submitters; + } + + tbb::task_group& operator[] (std::size_t idx) { return task_submitters[idx]; } + + const std::size_t pool_size; + tbb::task_group* task_submitters; +}; + +static task_group_pool tg_pool; + +class base_task { +public: + base_task() = default; + + base_task(const base_task& t) : m_type(t.m_type), m_parent(t.m_parent), m_child_counter(t.m_child_counter.load()) + {} + + virtual ~base_task() = default; + + void operator() () const { + task_type type_snapshot = m_type; + + base_task* bypass = const_cast(this)->execute(); + + if (m_parent && m_type != task_type::recycled) { + if (m_parent->remove_child_reference() == 0) { + m_parent->operator()(); + } + } + + if (m_type == task_type::allocated) { + delete this; + } + + if (bypass != nullptr) { + m_type = type_snapshot; + + // Bypass is not supported by task_emulation and next_task executed directly. 
+ // However, the old-TBB bypass behavior can be achieved with + // `return task_group::defer()` (check Migration Guide). + // Consider submit another task if recursion call is not acceptable + // i.e. instead of Direct Body call + // submit task_emulation::run_task(); + bypass->operator()(); + } + } + + virtual base_task* execute() = 0; + + template + C* allocate_continuation(std::uint64_t ref, Args&&... args) { + C* continuation = new C{std::forward(args)...}; + continuation->m_type = task_type::allocated; + continuation->reset_parent(reset_parent()); + continuation->m_child_counter = ref; + return continuation; + } + + template + F create_child(Args&&... args) { + return create_child_impl(std::forward(args)...); + } + + template + F create_child_and_increment(Args&&... args) { + add_child_reference(); + return create_child_impl(std::forward(args)...); + } + + template + F* allocate_child(Args&&... args) { + return allocate_child_impl(std::forward(args)...); + } + + template + F* allocate_child_and_increment(Args&&... args) { + add_child_reference(); + return allocate_child_impl(std::forward(args)...); + } + + template + void recycle_as_child_of(C& c) { + m_type = task_type::recycled; + reset_parent(&c); + } + + void recycle_as_continuation() { + m_type = task_type::recycled; + } + + void add_child_reference() { + ++m_child_counter; + } + + std::uint64_t remove_child_reference() { + return --m_child_counter; + } + +protected: + enum class task_type { + stack_based, + allocated, + recycled + }; + + mutable task_type m_type; + +private: + template + friend F create_root_task(tbb::task_group& tg, Args&&... args); + + template + friend F* allocate_root_task(tbb::task_group& tg, Args&&... args); + + template + F create_child_impl(Args&&... args) { + F obj{std::forward(args)...}; + obj.m_type = task_type::stack_based; + obj.reset_parent(this); + return obj; + } + + template + F* allocate_child_impl(Args&&... 
args) { + F* obj = new F{std::forward(args)...}; + obj->m_type = task_type::allocated; + obj->reset_parent(this); + return obj; + } + + base_task* reset_parent(base_task* ptr = nullptr) { + auto p = m_parent; + m_parent = ptr; + return p; + } + + base_task* m_parent{nullptr}; + std::atomic m_child_counter{0}; +}; + +class root_task : public base_task { +public: + root_task(tbb::task_group& tg) : m_tg(tg), m_callback(m_tg.defer([] { /* Create empty callback to preserve reference for wait. */})) { + add_child_reference(); + m_type = base_task::task_type::allocated; + } + +private: + base_task* execute() override { + m_tg.run(std::move(m_callback)); + return nullptr; + } + + tbb::task_group& m_tg; + tbb::task_handle m_callback; +}; + +template +F create_root_task(tbb::task_group& tg, Args&&... args) { + F obj{std::forward(args)...}; + obj.m_type = base_task::task_type::stack_based; + obj.reset_parent(new root_task{tg}); + return obj; +} + +template +F* allocate_root_task(tbb::task_group& tg, Args&&... 
args) { + F* obj = new F{std::forward(args)...}; + obj->m_type = base_task::task_type::allocated; + obj->reset_parent(new root_task{tg}); + return obj; +} + +template +void run_task(F&& f) { + tg_pool[tbb::this_task_arena::current_thread_index()].run(std::forward(f)); +} + +template +void run_task(F* f) { + tg_pool[tbb::this_task_arena::current_thread_index()].run(std::ref(*f)); +} + +template +void run_and_wait(tbb::task_group& tg, F* f) { + tg.run_and_wait(std::ref(*f)); +} +} // namespace task_emulation + +#endif // __TBB_task_emulation_layer_H diff --git a/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt b/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt index 47f7ca7b..59634242 100644 --- a/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(game_of_life CXX) diff --git a/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt b/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt index cb0475e2..a45aaa68 100644 --- a/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_for/polygon_overlay/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(polygon_overlay CXX) diff --git a/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt b/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt index 9236176b..61675f19 100644 --- a/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(seismic CXX) diff --git a/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt b/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt index 9dc0f83c..752fddef 100644 --- a/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(tachyon CXX) @@ -39,7 +39,6 @@ add_executable( src/imageio.cpp src/imap.cpp src/intersect.cpp - src/jpeg.cpp src/light.cpp src/objbound.cpp src/parse.cpp diff --git a/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp b/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp index a379c4dc..c1c9d762 100644 --- a/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp +++ b/third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -47,7 +47,7 @@ * imageio.cpp - This file deals with reading/writing image files */ -/* For our puposes, we're interested only in the 3 byte per pixel 24 bit +/* For our purposes, we're interested only in the 3 byte per pixel 24 bit * truecolor sort of file.. 
*/ @@ -59,7 +59,6 @@ #include "imageio.hpp" #include "ppm.hpp" /* PPM files */ #include "tgafile.hpp" /* Truevision Targa files */ -#include "jpeg.hpp" /* JPEG files */ static int fakeimage(char *name, int *xres, int *yres, unsigned char **imgdata) { int i, imgsize; @@ -90,7 +89,7 @@ int readimage(rawimage *img) { rc = readtga(name, &xres, &yres, &imgdata); } else if (strstr(name, ".jpg")) { - rc = readjpeg(name, &xres, &yres, &imgdata); + rc = IMAGEUNSUP; } else if (strstr(name, ".gif")) { rc = IMAGEUNSUP; diff --git a/third-party/tbb/examples/parallel_for/tachyon/src/imageio.hpp b/third-party/tbb/examples/parallel_for/tachyon/src/imageio.hpp index 31864006..8ad8c4be 100644 --- a/third-party/tbb/examples/parallel_for/tachyon/src/imageio.hpp +++ b/third-party/tbb/examples/parallel_for/tachyon/src/imageio.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ * $Id: imageio.h,v 1.2 2007-02-22 17:54:15 Exp $ */ -/* For our puposes, we're interested only in the 3 byte per pixel 24 bit +/* For our purposes, we're interested only in the 3 byte per pixel 24 bit truecolor sort of file.. */ #define IMAGENOERR 0 /* no error */ diff --git a/third-party/tbb/examples/parallel_for/tachyon/src/jpeg.cpp b/third-party/tbb/examples/parallel_for/tachyon/src/jpeg.cpp index 5a5885f4..ed066364 100644 --- a/third-party/tbb/examples/parallel_for/tachyon/src/jpeg.cpp +++ b/third-party/tbb/examples/parallel_for/tachyon/src/jpeg.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ /* * This code requires support from the Independent JPEG Group's libjpeg. 
- * For our puposes, we're interested only in the 3 byte per pixel 24 bit + * For our purposes, we're interested only in the 3 byte per pixel 24 bit * RGB output. Probably won't implement any decent checking at this point. */ diff --git a/third-party/tbb/examples/parallel_for/tachyon/src/ppm.cpp b/third-party/tbb/examples/parallel_for/tachyon/src/ppm.cpp index fb898369..4802f76c 100644 --- a/third-party/tbb/examples/parallel_for/tachyon/src/ppm.cpp +++ b/third-party/tbb/examples/parallel_for/tachyon/src/ppm.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -47,7 +47,7 @@ * ppm.cpp - This file deals with PPM format image files (reading/writing) */ -/* For our puposes, we're interested only in the 3 byte per pixel 24 bit +/* For our purposes, we're interested only in the 3 byte per pixel 24 bit truecolor sort of file.. Probably won't implement any decent checking at this point, probably choke on things like the # comments.. */ diff --git a/third-party/tbb/examples/parallel_for/tachyon/src/ppm.hpp b/third-party/tbb/examples/parallel_for/tachyon/src/ppm.hpp index 7c09b0c2..cb306b91 100644 --- a/third-party/tbb/examples/parallel_for/tachyon/src/ppm.hpp +++ b/third-party/tbb/examples/parallel_for/tachyon/src/ppm.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ * $Id: ppm.h,v 1.2 2007-02-22 17:54:16 Exp $ */ -/* For our puposes, we're interested only in the 3 byte per pixel 24 bit +/* For our purposes, we're interested only in the 3 byte per pixel 24 bit truecolor sort of file.. 
Probably won't implement any decent checking at this point, probably choke on things like the # comments.. */ diff --git a/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt b/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt index 235604ab..8e98d360 100644 --- a/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(parallel_preorder CXX) diff --git a/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt b/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt index a32eaaf8..184c787e 100644 --- a/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(square CXX) diff --git a/third-party/tbb/examples/parallel_reduce/README.md b/third-party/tbb/examples/parallel_reduce/README.md index 481d8e18..0dba80ca 100644 --- a/third-party/tbb/examples/parallel_reduce/README.md +++ b/third-party/tbb/examples/parallel_reduce/README.md @@ -4,4 +4,5 @@ Examples using `parallel_reduce` algorithm. | Code sample name | Description |:--- |:--- | convex_hull | Parallel version of convex hull algorithm (quick hull). +| pi | Parallel version of calculating π by numerical integration. | primes | Parallel version of the Sieve of Eratosthenes. diff --git a/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt b/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt index de32d1de..0492244a 100644 --- a/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(convex_hull_bench CXX) project(convex_hull_sample CXX) diff --git a/third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt b/third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt new file mode 100644 index 00000000..62ebe022 --- /dev/null +++ b/third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.5) + +project(pi CXX) + +include(../../common/cmake/common.cmake) + +set_common_project_settings(tbb) + +add_executable(pi main.cpp pi.cpp) + +target_link_libraries(pi TBB::tbb Threads::Threads) +target_compile_options(pi PRIVATE ${TBB_CXX_STD_FLAG}) + +set(EXECUTABLE "$") +set(ARGS "") +set(PERF_ARGS auto 100000000000) + +add_execution_target(run_pi pi ${EXECUTABLE} "${ARGS}") +add_execution_target(perf_run_pi pi ${EXECUTABLE} "${PERF_ARGS}") diff --git a/third-party/tbb/examples/parallel_reduce/pi/README.md b/third-party/tbb/examples/parallel_reduce/pi/README.md new file mode 100644 index 00000000..be7ce0d4 --- /dev/null +++ b/third-party/tbb/examples/parallel_reduce/pi/README.md @@ -0,0 +1,24 @@ +# Pi Sample +Parallel version of calculating π by numerical integration. + +## Build +To build the sample, run the following commands: +``` +cmake +cmake --build . +``` + +## Run +### Predefined Make Targets +* `make run_pi` - executes the example with predefined parameters +* `make perf_run_pi` - executes the example with suggested parameters to measure the oneTBB performance + +### Application Parameters +You can use the following application parameters: +``` +pi [n-of-threads=value] [n-of-intervals=value] [silent] [-h] [n-of-threads [n-of-intervals]] +``` +* `-h` - prints the help for command-line options. +* `n-of-threads` - the number of threads to use. This number is specified in the low\[:high\] range format, where both ``low`` and, optionally, ``high`` are non-negative integers. 
You can also use ``auto`` to let the system choose a default number of threads suitable for the platform. +* `n-of-intervals` - the number of intervals to subdivide into. Must be a positive integer. +* `silent` - no output except the elapsed time. diff --git a/third-party/tbb/examples/parallel_reduce/pi/common.h b/third-party/tbb/examples/parallel_reduce/pi/common.h new file mode 100644 index 00000000..0e316854 --- /dev/null +++ b/third-party/tbb/examples/parallel_reduce/pi/common.h @@ -0,0 +1,51 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef TBB_examples_pi_H +#define TBB_examples_pi_H + +#include + +typedef std::size_t number_t; +typedef double pi_t; + +extern const number_t chunk_size; +extern number_t num_intervals; +extern pi_t step; + +extern bool silent; + +inline pi_t pi_kernel(number_t i) { + pi_t dx = (pi_t(i) + pi_t(0.5)) * step; + return pi_t(4.0) / (pi_t(1.0) + dx * dx); +} + +inline double pi_slice_kernel(number_t slice, number_t slice_size = chunk_size) { + pi_t pi = pi_t(0.0); + for (number_t i = slice; i < slice + slice_size; ++i) { + pi += pi_kernel(i); + } + return pi; +} + +struct threading { + threading(int p); + ~threading(); +}; + +double compute_pi_parallel(); + +#endif // TBB_examples_pi_H diff --git a/third-party/tbb/examples/parallel_reduce/pi/main.cpp b/third-party/tbb/examples/parallel_reduce/pi/main.cpp new file mode 100644 index 00000000..81690617 --- /dev/null +++ b/third-party/tbb/examples/parallel_reduce/pi/main.cpp @@ -0,0 +1,100 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "oneapi/tbb/tick_count.h" + +#include "common/utility/get_default_num_threads.hpp" +#include "common/utility/utility.hpp" + +#include "common.h" + +const number_t chunk_size = 4096; // Multiple of 16, to fit float datatype to a vector register. 
+ +// number of intervals +number_t num_intervals = 1000000000; +pi_t step = pi_t(0.0); + +bool silent = false; + +double compute_pi_serial() { + double ret = 0; + + step = pi_t(1.0) / num_intervals; + + number_t tail = num_intervals % chunk_size; + number_t last = num_intervals - tail; + + for (number_t slice = 0; slice < last; slice += chunk_size) { + ret += pi_slice_kernel(slice); + } + ret += pi_slice_kernel(last, tail); + ret *= step; + + return ret; +} + +int main(int argc, char* argv[]) { + try { + tbb::tick_count main_start_time = tbb::tick_count::now(); + // zero number of threads means to run serial version + utility::thread_number_range threads(utility::get_default_num_threads, 0); + + utility::parse_cli_arguments( + argc, + argv, + utility::cli_argument_pack() + //"-h" option for for displaying help is present implicitly + .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) + .positional_arg(num_intervals, "n-of-intervals", "number of intervals") + .arg(silent, "silent", "no output except time elapsed")); + + for (int p = threads.first; p <= threads.last; p = threads.step(p)) { + pi_t pi; + double compute_time; + if (p == 0) { + //run a serial version + tbb::tick_count compute_start_time = tbb::tick_count::now(); + pi = compute_pi_serial(); + compute_time = (tbb::tick_count::now() - compute_start_time).seconds(); + } + else { + //run a parallel version + threading tp(p); + tbb::tick_count compute_start_time = tbb::tick_count::now(); + pi = compute_pi_parallel(); + compute_time = (tbb::tick_count::now() - compute_start_time).seconds(); + } + + if (!silent) { + if (p == 0) { + std::cout << "Serial run:\tpi = " << pi << "\tcompute time = " << compute_time + << " sec\n"; + } + else { + std::cout << "Parallel run:\tpi = " << pi << "\tcompute time = " << compute_time + << " sec\t on " << p << " threads\n"; + } + } + } + + utility::report_elapsed_time((tbb::tick_count::now() - main_start_time).seconds()); + return 0; + } + catch 
(std::exception& e) { + std::cerr << "error occurred. error text is :\"" << e.what() << "\"\n"; + return 1; + } +} diff --git a/third-party/tbb/examples/parallel_reduce/pi/pi.cpp b/third-party/tbb/examples/parallel_reduce/pi/pi.cpp new file mode 100644 index 00000000..230752a9 --- /dev/null +++ b/third-party/tbb/examples/parallel_reduce/pi/pi.cpp @@ -0,0 +1,55 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "common.h" +#include "oneapi/tbb/blocked_range.h" +#include "oneapi/tbb/global_control.h" +#include "oneapi/tbb/parallel_reduce.h" + +struct reduce_body { + double my_pi; + reduce_body() : my_pi(0) {} + reduce_body(reduce_body& x, tbb::split) : my_pi(0) {} + void operator()(const tbb::blocked_range& r) { + my_pi += pi_slice_kernel(r.begin(), r.size()); + } + void join(const reduce_body& y) { + my_pi += y.my_pi; + } +}; + +double compute_pi_parallel() { + step = pi_t(1.0) / num_intervals; + + double ret = 0.0; + + reduce_body body; + tbb::parallel_reduce(tbb::blocked_range(0, num_intervals), body); + + ret = body.my_pi * step; + + return ret; +} + +static std::unique_ptr gc; + +threading::threading(int p) { + gc.reset(new tbb::global_control(tbb::global_control::max_allowed_parallelism, p)); +} + +threading::~threading() { + gc.reset(); +} diff --git a/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt b/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt index dabd9682..987d4656 100644 --- 
a/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt +++ b/third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(primes CXX) diff --git a/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt b/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt index 888428b3..857dae64 100644 --- a/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt +++ b/third-party/tbb/examples/task_arena/fractal/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(fractal CXX) diff --git a/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt b/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt index 5fea9ee6..f514662a 100644 --- a/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt +++ b/third-party/tbb/examples/task_group/sudoku/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(sudoku CXX) diff --git a/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt b/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt index 5c97e28a..3b2368e0 100644 --- a/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt +++ b/third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021 Intel Corporation +# Copyright (c) 2019-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.1) +cmake_minimum_required(VERSION 3.5) project(fibonacci CXX) diff --git a/third-party/tbb/include/oneapi/tbb.h b/third-party/tbb/include/oneapi/tbb.h index 3782c74d..ad960113 100644 --- a/third-party/tbb/include/oneapi/tbb.h +++ b/third-party/tbb/include/oneapi/tbb.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -63,6 +63,8 @@ #include "oneapi/tbb/queuing_rw_mutex.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/spin_rw_mutex.h" +#include "oneapi/tbb/mutex.h" +#include "oneapi/tbb/rw_mutex.h" #include "oneapi/tbb/task.h" #include "oneapi/tbb/task_arena.h" #include "oneapi/tbb/task_group.h" diff --git a/third-party/tbb/include/oneapi/tbb/concurrent_queue.h b/third-party/tbb/include/oneapi/tbb/concurrent_queue.h index 1e7ff50b..cfd5db6a 100644 --- a/third-party/tbb/include/oneapi/tbb/concurrent_queue.h +++ b/third-party/tbb/include/oneapi/tbb/concurrent_queue.h @@ -685,7 +685,7 @@ concurrent_bounded_queue( It, It, Alloc = Alloc() ) #endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ } //namespace d2 -} // namesapce detail +} // namespace detail inline namespace v1 { diff --git a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h index ade91c33..40829208 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h @@ -921,7 +921,7 @@ class concurrent_unordered_base { node_allocator_traits::deallocate(dummy_node_allocator, node, 1); } else { // GCC 11.1 issues a warning here that incorrect destructor might be called for dummy_nodes - #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 130000 ) && !__clang__ && !__INTEL_COMPILER + #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER volatile #endif value_node_ptr val_node = static_cast(node); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_config.h b/third-party/tbb/include/oneapi/tbb/detail/_config.h index ad9f0f31..d6705e15 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_config.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_config.h @@ -188,7 +188,7 @@ /** __TBB_DYNAMIC_LOAD_ENABLED describes the system possibility to load shared libraries at run time **/ 
#ifndef __TBB_DYNAMIC_LOAD_ENABLED - #define __TBB_DYNAMIC_LOAD_ENABLED 1 + #define __TBB_DYNAMIC_LOAD_ENABLED (!__EMSCRIPTEN__) #endif /** __TBB_WIN8UI_SUPPORT enables support of Windows* Store Apps and limit a possibility to load @@ -201,7 +201,7 @@ /** __TBB_WEAK_SYMBOLS_PRESENT denotes that the system supports the weak symbol mechanism **/ #ifndef __TBB_WEAK_SYMBOLS_PRESENT - #define __TBB_WEAK_SYMBOLS_PRESENT ( !_WIN32 && !__APPLE__ && !__sun && (__TBB_GCC_VERSION >= 40000 || __INTEL_COMPILER ) ) + #define __TBB_WEAK_SYMBOLS_PRESENT ( !__EMSCRIPTEN__ && !_WIN32 && !__APPLE__ && !__sun && (__TBB_GCC_VERSION >= 40000 || __INTEL_COMPILER ) ) #endif /** Presence of compiler features **/ @@ -380,6 +380,9 @@ #define __TBB_ARENA_BINDING 1 #endif +// Thread pinning is not available on macOS* +#define __TBB_CPUBIND_PRESENT (__TBB_ARENA_BINDING && !__APPLE__) + #ifndef __TBB_ENQUEUE_ENFORCED_CONCURRENCY #define __TBB_ENQUEUE_ENFORCED_CONCURRENCY 1 #endif diff --git a/third-party/tbb/include/oneapi/tbb/detail/_machine.h b/third-party/tbb/include/oneapi/tbb/detail/_machine.h index 7a4a1e31..ca481380 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_machine.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_machine.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -96,7 +96,7 @@ static inline void machine_pause(int32_t delay) { #if __TBB_x86_64 || __TBB_x86_32 while (delay-- > 0) { _mm_pause(); } #elif __ARM_ARCH_7A__ || __aarch64__ - while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); } + while (delay-- > 0) { __asm__ __volatile__("isb sy" ::: "memory"); } #else /* Generic */ (void)delay; // suppress without including _template_helpers.h yield(); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h b/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h index 34913710..50ce3d2d 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h @@ -401,4 +401,3 @@ using type_identity_t = typename type_identity::type; } // namespace tbb #endif // __TBB_detail__template_helpers_H - diff --git a/third-party/tbb/include/oneapi/tbb/detail/_utils.h b/third-party/tbb/include/oneapi/tbb/detail/_utils.h index 1ac2e3ba..1f480702 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_utils.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_utils.h @@ -132,6 +132,12 @@ bool timed_spin_wait_until(Condition condition) { return finish; } +template +T clamp(T value, T lower_bound, T upper_bound) { + __TBB_ASSERT(lower_bound <= upper_bound, "Incorrect bounds"); + return value > lower_bound ? (value > upper_bound ? 
upper_bound : value) : lower_bound; +} + template std::uintptr_t log2(T in) { __TBB_ASSERT(in > 0, "The logarithm of a non-positive value is undefined."); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_waitable_atomic.h b/third-party/tbb/include/oneapi/tbb/detail/_waitable_atomic.h index fa7280a5..1b18d11e 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_waitable_atomic.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_waitable_atomic.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2021 Intel Corporation + Copyright (c) 2021-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -69,22 +69,6 @@ class waitable_atomic { } } - void wait_until(T expected, std::uintptr_t context, std::memory_order order) { - auto wakeup_condition = [&] { return my_atomic.load(order) == expected; }; - if (!timed_spin_wait_until(wakeup_condition)) { - // We need to use while here, because notify_all() will wake up all threads - // But predicate for them might be false - d1::delegated_function pred(wakeup_condition); - do { - r1::wait_on_address(this, pred, context); - } while (!wakeup_condition()); - } - } - - void notify_relaxed(std::uintptr_t context) { - r1::notify_by_address(this, context); - } - void notify_one_relaxed() { r1::notify_by_address_one(this); } @@ -92,6 +76,8 @@ class waitable_atomic { // TODO: consider adding following interfaces: // store(desired, memory_order) // notify_all_relaxed() + // wait_until(T, std::uintptr_t, std::memory_order) + // notify_relaxed(std::uintptr_t context) private: std::atomic my_atomic{}; diff --git a/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h b/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h index 34bcab68..caa53fa0 100644 --- a/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h +++ b/third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h @@ -1,5 +1,5 @@ /* - Copyright (c) 
2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,7 +36,15 @@ #include "task.h" // for task::suspend_point #if _WIN32 || _WIN64 +#ifndef NOMINMAX +#define NOMINMAX +#define __TBB_DEFINED_NOMINMAX 1 +#endif #include +#if __TBB_DEFINED_NOMINMAX +#undef NOMINMAX +#undef __TBB_DEFINED_NOMINMAX +#endif #else #include #endif diff --git a/third-party/tbb/include/oneapi/tbb/mutex.h b/third-party/tbb/include/oneapi/tbb/mutex.h index a4d2a9a3..169b7a3c 100644 --- a/third-party/tbb/include/oneapi/tbb/mutex.h +++ b/third-party/tbb/include/oneapi/tbb/mutex.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2021 Intel Corporation + Copyright (c) 2021-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,9 +36,7 @@ class mutex { }; //! Destructor - ~mutex() { - __TBB_ASSERT(!my_flag.load(std::memory_order_relaxed), "destruction of an acquired mutex"); - } + ~mutex() = default; //! No Copy mutex(const mutex&) = delete; diff --git a/third-party/tbb/include/oneapi/tbb/parallel_for.h b/third-party/tbb/include/oneapi/tbb/parallel_for.h index 91c7c44c..37a26135 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_for.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_for.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -319,7 +319,7 @@ void parallel_for_impl(Index first, Index last, Index step, const Function& f, P template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f) { - parallel_for_impl(first, last, step, f, auto_partitioner()); + parallel_for_impl(first, last, step, f, __TBB_DEFAULT_PARTITIONER()); } //! Parallel iteration over a range of integers with a step provided and simple partitioner template @@ -350,7 +350,7 @@ void parallel_for(Index first, Index last, Index step, const Function& f, affini template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f) { - parallel_for_impl(first, last, static_cast(1), f, auto_partitioner()); + parallel_for_impl(first, last, static_cast(1), f, __TBB_DEFAULT_PARTITIONER()); } //! Parallel iteration over a range of integers with a default step value and simple partitioner template @@ -395,7 +395,7 @@ void parallel_for_impl(Index first, Index last, Index step, const Function& f, P template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) { - parallel_for_impl(first, last, step, f, auto_partitioner(), context); + parallel_for_impl(first, last, step, f, __TBB_DEFAULT_PARTITIONER(), context); } //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner template @@ -426,7 +426,7 @@ void parallel_for(Index first, Index last, Index step, const Function& f, affini template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, task_group_context &context) { - parallel_for_impl(first, last, static_cast(1), f, auto_partitioner(), context); + parallel_for_impl(first, last, static_cast(1), f, __TBB_DEFAULT_PARTITIONER(), context); } //! 
Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner template diff --git a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h index 56dbeb41..ab0b3453 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h @@ -407,6 +407,34 @@ class parallel_for_body_wrapper { template using tag = typename std::iterator_traits::iterator_category; +#if __TBB_CPP20_PRESENT +template +struct move_iterator_dispatch_helper { + using type = It; +}; + +// Until C++23, std::move_iterator::iterator_concept always defines +// to std::input_iterator_tag and hence std::forward_iterator concept +// always evaluates to false, so std::move_iterator dispatch should be +// made according to the base iterator type. +template +struct move_iterator_dispatch_helper> { + using type = It; +}; + +template +using iterator_tag_dispatch_impl = + std::conditional_t, + std::random_access_iterator_tag, + std::conditional_t, + std::forward_iterator_tag, + std::input_iterator_tag>>; + +template +using iterator_tag_dispatch = + iterator_tag_dispatch_impl::type>; + +#else template using iterator_tag_dispatch = typename std::conditional< @@ -418,6 +446,7 @@ using iterator_tag_dispatch = typename std::input_iterator_tag >::type >::type; +#endif // __TBB_CPP20_PRESENT template using feeder_is_required = tbb::detail::void_t(), diff --git a/third-party/tbb/include/oneapi/tbb/parallel_invoke.h b/third-party/tbb/include/oneapi/tbb/parallel_invoke.h index 6eb0f2e5..4bc5d853 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_invoke.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_invoke.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the 
License. @@ -59,7 +59,7 @@ struct function_invoker : public task { }; // struct function_invoker //! Task object for managing subroots in trinary task trees. -// Endowed with additional synchronization logic (compatible with wait object intefaces) to support +// Endowed with additional synchronization logic (compatible with wait object interfaces) to support // continuation passing execution. This task spawns 2 function_invoker tasks with first and second functors // and then executes first functor by itself. But only the last executed functor must destruct and deallocate // the subroot task. diff --git a/third-party/tbb/include/oneapi/tbb/parallel_reduce.h b/third-party/tbb/include/oneapi/tbb/parallel_reduce.h index 401ad004..205c97ef 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_reduce.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_reduce.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -42,16 +42,16 @@ concept parallel_reduce_body = splittable && template concept parallel_reduce_function = std::invocable&, - const Range&, const Value&> && + const Range&, Value&&> && std::convertible_to&, - const Range&, const Value&>, + const Range&, Value&&>, Value>; template concept parallel_reduce_combine = std::invocable&, - const Value&, const Value&> && + Value&&, Value&&> && std::convertible_to&, - const Value&, const Value&>, + Value&&, Value&&>, Value>; } // namespace d0 @@ -390,14 +390,15 @@ class lambda_reduce_body { , my_value(other.my_identity_element) { } void operator()(Range& range) { - my_value = tbb::detail::invoke(my_real_body, range, const_cast(my_value)); + my_value = tbb::detail::invoke(my_real_body, range, std::move(my_value)); } + void join( lambda_reduce_body& rhs ) { - my_value = tbb::detail::invoke(my_reduction, const_cast(my_value), - const_cast(rhs.my_value)); + my_value = tbb::detail::invoke(my_reduction, std::move(my_value), std::move(rhs.my_value)); } - Value result() const { - return my_value; + + __TBB_nodiscard Value&& result() && noexcept { + return std::move(my_value); } }; @@ -514,7 +515,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const __TBB_DEFAULT_PARTITIONER> ::run(range, body, __TBB_DEFAULT_PARTITIONER() ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with reduction and simple_partitioner. @@ -527,7 +528,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const simple_partitioner> ::run(range, body, partitioner ); - return body.result(); + return std::move(body).result(); } //! 
Parallel iteration with reduction and auto_partitioner @@ -540,7 +541,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const auto_partitioner> ::run( range, body, partitioner ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with reduction and static_partitioner @@ -553,7 +554,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const static_partitioner> ::run( range, body, partitioner ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with reduction and affinity_partitioner @@ -566,7 +567,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,affinity_partitioner> ::run( range, body, partitioner ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with reduction, default partitioner and user-supplied context. @@ -579,7 +580,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const __TBB_DEFAULT_PARTITIONER> ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with reduction, simple partitioner and user-supplied context. @@ -592,7 +593,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const simple_partitioner> ::run( range, body, partitioner, context ); - return body.result(); + return std::move(body).result(); } //! 
Parallel iteration with reduction, auto_partitioner and user-supplied context @@ -605,7 +606,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const auto_partitioner> ::run( range, body, partitioner, context ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with reduction, static_partitioner and user-supplied context @@ -618,7 +619,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,const static_partitioner> ::run( range, body, partitioner, context ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with reduction, affinity_partitioner and user-supplied context @@ -631,7 +632,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody lambda_reduce_body body(identity, real_body, reduction); start_reduce,affinity_partitioner> ::run( range, body, partitioner, context ); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with deterministic reduction and default simple partitioner. @@ -704,7 +705,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity, lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const simple_partitioner> ::run(range, body, partitioner); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with deterministic reduction and static partitioner. @@ -716,7 +717,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity, lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const static_partitioner> ::run(range, body, partitioner); - return body.result(); + return std::move(body).result(); } //! 
Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context. @@ -739,7 +740,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity, lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const simple_partitioner> ::run(range, body, partitioner, context); - return body.result(); + return std::move(body).result(); } //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context. @@ -752,7 +753,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity, lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const static_partitioner> ::run(range, body, partitioner, context); - return body.result(); + return std::move(body).result(); } //@} diff --git a/third-party/tbb/include/oneapi/tbb/parallel_scan.h b/third-party/tbb/include/oneapi/tbb/parallel_scan.h index 6d2a4d64..d624f7eb 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_scan.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_scan.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -562,7 +562,7 @@ class lambda_scan_body { template __TBB_requires(tbb_range && parallel_scan_body) void parallel_scan( const Range& range, Body& body ) { - start_scan::run(range,body,__TBB_DEFAULT_PARTITIONER()); + start_scan::run(range,body,__TBB_DEFAULT_PARTITIONER()); } //! 
Parallel prefix with simple_partitioner diff --git a/third-party/tbb/include/oneapi/tbb/partitioner.h b/third-party/tbb/include/oneapi/tbb/partitioner.h index 98de0d42..f09786c0 100644 --- a/third-party/tbb/include/oneapi/tbb/partitioner.h +++ b/third-party/tbb/include/oneapi/tbb/partitioner.h @@ -340,7 +340,7 @@ struct proportional_mode : adaptive_mode { // Create the proportion from partitioner internal resources (threads) that would be used: // - into proportional_mode constructor to split the partitioner // - if Range supports the proportional_split constructor it would use proposed proportion, - // otherwise, the tbb::proportional_split object will be implicitly (for Range implementor) + // otherwise, the tbb::proportional_split object will be implicitly (for Range implementer) // casted to tbb::split std::size_t n = self().my_divisor / my_partition::factor; diff --git a/third-party/tbb/include/oneapi/tbb/scalable_allocator.h b/third-party/tbb/include/oneapi/tbb/scalable_allocator.h index 36a9da49..31650a0a 100644 --- a/third-party/tbb/include/oneapi/tbb/scalable_allocator.h +++ b/third-party/tbb/include/oneapi/tbb/scalable_allocator.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #ifdef __cplusplus #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_utils.h" +#include "oneapi/tbb/detail/_namespace_injection.h" #include #include #include /* std::bad_alloc() */ diff --git a/third-party/tbb/include/oneapi/tbb/task_arena.h b/third-party/tbb/include/oneapi/tbb/task_arena.h index 0de49aef..5ce41d99 100644 --- a/third-party/tbb/include/oneapi/tbb/task_arena.h +++ b/third-party/tbb/include/oneapi/tbb/task_arena.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -110,7 +110,8 @@ inline void enqueue_impl(task_handle&& th, d1::task_arena_base* ta) { namespace d1 { -static constexpr int priority_stride = INT_MAX / 4; +static constexpr unsigned num_priority_levels = 3; +static constexpr int priority_stride = INT_MAX / (num_priority_levels + 1); class task_arena_base { friend struct r1::task_arena_impl; diff --git a/third-party/tbb/include/oneapi/tbb/task_group.h b/third-party/tbb/include/oneapi/tbb/task_group.h index 2bbacd55..04e241f6 100644 --- a/third-party/tbb/include/oneapi/tbb/task_group.h +++ b/third-party/tbb/include/oneapi/tbb/task_group.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -51,7 +51,7 @@ class task_group_base; namespace r1 { // Forward declarations class tbb_exception_ptr; -class market; +class cancellation_disseminator; class thread_data; class task_dispatcher; template @@ -407,7 +407,7 @@ class task_group_context : no_copy { } private: //// TODO: cleanup friends - friend class r1::market; + friend class r1::cancellation_disseminator; friend class r1::thread_data; friend class r1::task_dispatcher; template diff --git a/third-party/tbb/include/oneapi/tbb/version.h b/third-party/tbb/include/oneapi/tbb/version.h index 965af129..fff3e7e2 100644 --- a/third-party/tbb/include/oneapi/tbb/version.h +++ b/third-party/tbb/include/oneapi/tbb/version.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,18 +29,22 @@ // Product version #define TBB_VERSION_MAJOR 2021 // Update version -#define TBB_VERSION_MINOR 10 +#define TBB_VERSION_MINOR 13 // "Patch" version for custom releases #define TBB_VERSION_PATCH 0 // Suffix string #define __TBB_VERSION_SUFFIX "" // Full official version string -#define TBB_VERSION_STRING __TBB_STRING(TBB_VERSION_MAJOR) "." __TBB_STRING(TBB_VERSION_MINOR) __TBB_VERSION_SUFFIX +#define TBB_VERSION_STRING \ + __TBB_STRING(TBB_VERSION_MAJOR) "." \ + __TBB_STRING(TBB_VERSION_MINOR) "." \ + __TBB_STRING(TBB_VERSION_PATCH) \ + __TBB_VERSION_SUFFIX // OneAPI oneTBB specification version #define ONETBB_SPEC_VERSION "1.0" // Full interface version -#define TBB_INTERFACE_VERSION 12100 +#define TBB_INTERFACE_VERSION 12130 // Major interface version #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) // Minor interface version @@ -51,37 +55,37 @@ #define __TBB_BINARY_VERSION 12 //! 
TBB_VERSION support -#ifndef ENDL -#define ENDL "\n" +#ifndef TBB_ENDL +#define TBB_ENDL "\n" #endif //TBB_REVAMP_TODO: consider enabling version_string.ver generation //TBB_REVAMP_TODO: #include "version_string.ver" -#define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" ONETBB_SPEC_VERSION ENDL -#define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING ENDL -#define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) ENDL +#define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" ONETBB_SPEC_VERSION TBB_ENDL +#define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING TBB_ENDL +#define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) TBB_ENDL #ifndef TBB_USE_DEBUG - #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" ENDL + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" TBB_ENDL #elif TBB_USE_DEBUG==0 - #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" ENDL + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" TBB_ENDL #elif TBB_USE_DEBUG==1 - #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" ENDL + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" TBB_ENDL #elif TBB_USE_DEBUG==2 - #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" ENDL + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" TBB_ENDL #else #error Unexpected value for TBB_USE_DEBUG #endif #ifndef TBB_USE_ASSERT - #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" ENDL + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" TBB_ENDL #elif TBB_USE_ASSERT==0 - #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" ENDL + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" TBB_ENDL #elif TBB_USE_ASSERT==1 - #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" ENDL + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" 
TBB_ENDL #elif TBB_USE_ASSERT==2 - #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" ENDL + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" TBB_ENDL #else #error Unexpected value for TBB_USE_ASSERT #endif diff --git a/third-party/tbb/include/tbb/mutex.h b/third-party/tbb/include/tbb/mutex.h new file mode 100644 index 00000000..91dbee0f --- /dev/null +++ b/third-party/tbb/include/tbb/mutex.h @@ -0,0 +1,17 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "../oneapi/tbb/mutex.h" diff --git a/third-party/tbb/include/tbb/rw_mutex.h b/third-party/tbb/include/tbb/rw_mutex.h new file mode 100644 index 00000000..f2499eba --- /dev/null +++ b/third-party/tbb/include/tbb/rw_mutex.h @@ -0,0 +1,17 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "../oneapi/tbb/rw_mutex.h" diff --git a/third-party/tbb/integration/linux/env/vars.sh b/third-party/tbb/integration/linux/env/vars.sh index 151b298d..5d913bb6 100644 --- a/third-party/tbb/integration/linux/env/vars.sh +++ b/third-party/tbb/integration/linux/env/vars.sh @@ -1,7 +1,7 @@ #!/bin/sh # shellcheck shell=sh # -# Copyright (c) 2005-2021 Intel Corporation +# Copyright (c) 2005-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# The script is setting up environment for TBB. +# The script is setting up environment for oneTBB. # Supported arguments: # intel64|ia32 - architecture, intel64 is default. @@ -139,6 +139,7 @@ fi TBBROOT=$(get_script_path "${vars_script_name:-}")/.. TBB_TARGET_ARCH="intel64" +TBB_ARCH_SUFFIX="" if [ -n "${SETVARS_ARGS:-}" ]; then tbb_arg_ia32="$(expr "${SETVARS_ARGS:-}" : '^.*\(ia32\)')" || true @@ -157,17 +158,26 @@ else fi TBB_LIB_NAME="libtbb.so.12" -TBB_LIB_DIR="$TBB_TARGET_ARCH/gcc4.8" -if [ -e "$TBBROOT/lib/$TBB_LIB_DIR/$TBB_LIB_NAME" ]; then +# Parse layout +if [ -e "$TBBROOT/lib/$TBB_TARGET_ARCH" ]; then + TBB_LIB_DIR="$TBB_TARGET_ARCH/gcc4.8" +else + if [ "$TBB_TARGET_ARCH" = "ia32" ] ; then + TBB_ARCH_SUFFIX="32" + fi + TBB_LIB_DIR="" +fi + +if [ -e "$TBBROOT/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR/$TBB_LIB_NAME" ]; then export TBBROOT - LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib/$TBB_LIB_DIR" "${LIBRARY_PATH:-}") ; export LIBRARY_PATH - LD_LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib/$TBB_LIB_DIR" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH + LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR" "${LIBRARY_PATH:-}") ; export LIBRARY_PATH + LD_LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH 
CPATH=$(prepend_path "${TBBROOT}/include" "${CPATH:-}") ; export CPATH CMAKE_PREFIX_PATH=$(prepend_path "${TBBROOT}" "${CMAKE_PREFIX_PATH:-}") ; export CMAKE_PREFIX_PATH - PKG_CONFIG_PATH=$(prepend_path "${TBBROOT}/lib/pkgconfig" "${PKG_CONFIG_PATH:-}") ; export PKG_CONFIG_PATH + PKG_CONFIG_PATH=$(prepend_path "${TBBROOT}/lib$TBB_ARCH_SUFFIX/pkgconfig" "${PKG_CONFIG_PATH:-}") ; export PKG_CONFIG_PATH else - >&2 echo "ERROR: $TBB_LIB_NAME library does not exist in $TBBROOT/lib/$TBB_LIB_DIR." + >&2 echo "ERROR: $TBB_LIB_NAME library does not exist in $TBBROOT/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR." return 255 2>/dev/null || exit 255 fi diff --git a/third-party/tbb/integration/linux/modulefiles/tbb b/third-party/tbb/integration/linux/modulefiles/tbb index b5eeb320..b8c695ed 100644 --- a/third-party/tbb/integration/linux/modulefiles/tbb +++ b/third-party/tbb/integration/linux/modulefiles/tbb @@ -1,6 +1,6 @@ #%Module1.0################################################################### # -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,17 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Why all the directory and filename boilerplate code? It is needed in order -# to properly remove symbolic links used in assembly of the modulefiles -# folder as well as those found within the oneAPI installation folders. -# Without it many modulefiles will fail to work as expected. - -# IMPORTANT: quotes around "$variables" and "[expressions]" are there -# to insure that paths/filenames which include spaces are handled properly. - # This modulefile requires Environment Modules 4.1 or later. # Type `module --version` to determine the current installed version. 
+############################################################################## + set min_tcl_ver 8.4 if { $tcl_version < $min_tcl_ver } { puts stderr " " @@ -33,72 +27,43 @@ if { $tcl_version < $min_tcl_ver } { exit 1 } -# get full pathname for this script file +# if modulefile script name is a symlink, resolve it to get the fully +# qualified pathname that points to the actual modulefile script +# see: https://wiki.tcl-lang.org/page/file+normalize set scriptpath "${ModulesCurrentModulefile}" - -# if modulefile script name is a symlink, resolve it -if { "[file type "$scriptpath"]" eq "link" } { - set scriptpath "[file readlink "$scriptpath"]" +set scriptpath "[file dirname [file normalize "$scriptpath/___"]]" + +# define componentroot, modulefilepath, modulefilename and modulefilever +set modulefilename "[file tail [file dirname "${scriptpath}"]]" +set modulefilever "[file tail "${scriptpath}"]" +set modulefilepath "${scriptpath}" +set componentroot "[file dirname [file dirname [file dirname [file dirname "${scriptpath}"]]]]" + +############################################################################## + +module-whatis "Name: Intel(R) oneAPI Threading Building Blocks" +module-whatis "Version: $modulefilename/$modulefilever" +module-whatis "Description: Flexible threading library for adding parallelism to complex applications across accelerated architectures." 
+module-whatis "URL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onetbb.html" +module-whatis "Dependencies: none" + +proc ModulesHelp { } { + global modulefilename + global modulefilever + module whatis "${modulefilename}/${modulefilever}" } -# if fullpath contains links, resolve them -set scriptpath "[file normalize "$scriptpath"]" - -# get directory holding this modulefile script and others -set modulefileroot "[file dirname "$scriptpath"]" +############################################################################## -# get name of modulefile script we are loading -set modulefilename "[file tail "$scriptpath"]" +# Define environment variables needed for an isolated component install. -# determine modulefile script version -set modulefilever "[file dirname "$modulefileroot"]" -set modulefilever "[file tail "$modulefilever"]" - -# point to component top-level root folder -set componentroot "[file dirname "$modulefileroot"]" -set componentroot "[file dirname "$componentroot"]" - -# get component folder name -set componentname "[file tail "$componentroot"]" - -# get oneAPI top-level root folder -set oneapiroot "[file dirname "$componentroot"]" - -# disallow loading multiple versions of this modulefile -# disallow loading multiple architectures of this modulefile -# if only 64-bit architecture exists the test still works -set mname32 $modulefilename -set mname64 [string trimright $mname32 "32"] -if { [string equal "$mname32" "$mname64"] } { - append mname32 "32" -} -conflict $mname32 -conflict $mname64 - - -# On load print component name and version being loaded -if { [ module-info mode load ] } { - puts stderr "Loading $modulefilename version $modulefilever" -} - -# On `module unload` print component module name and version being removed -# Include `module list` message only if this modulefile loads dependent modules -if { [ module-info mode ] == "unload" || [ module-info mode ] == "remove" } { - puts stderr "Removing $modulefilename version 
$modulefilever" - puts stderr "Use `module list` to view any remaining dependent modules." -} - - -# ###### Component Specific env vars setup ################################### - -set tbbroot "$componentroot/$modulefilever" +set tbbroot "$componentroot" set tbb_target_arch "intel64" -module-whatis "Intel(R) oneAPI Threading Building Blocks for $tbb_target_arch." - setenv TBBROOT "$tbbroot" prepend-path CPATH "$tbbroot/include" -prepend-path LIBRARY_PATH "$tbbroot/lib/$tbb_target_arch/gcc4.8" -prepend-path LD_LIBRARY_PATH "$tbbroot/lib/$tbb_target_arch/gcc4.8" +prepend-path LIBRARY_PATH "$tbbroot/lib" +prepend-path LD_LIBRARY_PATH "$tbbroot/lib" prepend-path CMAKE_PREFIX_PATH "$tbbroot" +prepend-path PKG_CONFIG_PATH "$tbbroot/lib/pkgconfig" diff --git a/third-party/tbb/integration/linux/modulefiles/tbb32 b/third-party/tbb/integration/linux/modulefiles/tbb32 index ec8ff874..db341351 100644 --- a/third-party/tbb/integration/linux/modulefiles/tbb32 +++ b/third-party/tbb/integration/linux/modulefiles/tbb32 @@ -1,6 +1,6 @@ #%Module1.0################################################################### # -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,17 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Why all the directory and filename boilerplate code? It is needed in order -# to properly remove symbolic links used in assembly of the modulefiles -# folder as well as those found within the oneAPI installation folders. -# Without it many modulefiles will fail to work as expected. - -# IMPORTANT: quotes around "$variables" and "[expressions]" are there -# to insure that paths/filenames which include spaces are handled properly. - # This modulefile requires Environment Modules 4.1 or later. 
# Type `module --version` to determine the current installed version. +############################################################################## + set min_tcl_ver 8.4 if { $tcl_version < $min_tcl_ver } { puts stderr " " @@ -33,72 +27,43 @@ if { $tcl_version < $min_tcl_ver } { exit 1 } -# get full pathname for this script file +# if modulefile script name is a symlink, resolve it to get the fully +# qualified pathname that points to the actual modulefile script +# see: https://wiki.tcl-lang.org/page/file+normalize set scriptpath "${ModulesCurrentModulefile}" - -# if modulefile script name is a symlink, resolve it -if { "[file type "$scriptpath"]" eq "link" } { - set scriptpath "[file readlink "$scriptpath"]" +set scriptpath "[file dirname [file normalize "$scriptpath/___"]]" + +# define componentroot, modulefilepath, modulefilename and modulefilever +set modulefilename "[file tail [file dirname "${scriptpath}"]]" +set modulefilever "[file tail "${scriptpath}"]" +set modulefilepath "${scriptpath}" +set componentroot "[file dirname [file dirname [file dirname [file dirname "${scriptpath}"]]]]" + +############################################################################## + +module-whatis "Name: Intel(R) oneAPI Threading Building Blocks" +module-whatis "Version: $modulefilename/$modulefilever" +module-whatis "Description: Flexible threading library for adding parallelism to complex applications across accelerated architectures." 
+module-whatis "URL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onetbb.html" +module-whatis "Dependencies: none" + +proc ModulesHelp { } { + global modulefilename + global modulefilever + module whatis "${modulefilename}/${modulefilever}" } -# if fullpath contains links, resolve them -set scriptpath "[file normalize "$scriptpath"]" - -# get directory holding this modulefile script and others -set modulefileroot "[file dirname "$scriptpath"]" +############################################################################## -# get name of modulefile script we are loading -set modulefilename "[file tail "$scriptpath"]" +# Define environment variables needed for an isolated component install. -# determine modulefile script version -set modulefilever "[file dirname "$modulefileroot"]" -set modulefilever "[file tail "$modulefilever"]" - -# point to component top-level root folder -set componentroot "[file dirname "$modulefileroot"]" -set componentroot "[file dirname "$componentroot"]" - -# get component folder name -set componentname "[file tail "$componentroot"]" - -# get oneAPI top-level root folder -set oneapiroot "[file dirname "$componentroot"]" - -# disallow loading multiple versions of this modulefile -# disallow loading multiple architectures of this modulefile -# if only 64-bit architecture exists the test still works -set mname32 $modulefilename -set mname64 [string trimright $mname32 "32"] -if { [string equal "$mname32" "$mname64"] } { - append mname32 "32" -} -conflict $mname32 -conflict $mname64 - - -# On load print component name and version being loaded -if { [ module-info mode load ] } { - puts stderr "Loading $modulefilename version $modulefilever" -} - -# On `module unload` print component module name and version being removed -# Include `module list` message only if this modulefile loads dependent modules -if { [ module-info mode ] == "unload" || [ module-info mode ] == "remove" } { - puts stderr "Removing $modulefilename version 
$modulefilever" - puts stderr "Use `module list` to view any remaining dependent modules." -} - - -# ###### Component Specific env vars setup ################################### - -set tbbroot "$componentroot/$modulefilever" +set tbbroot "$componentroot" set tbb_target_arch "ia32" -module-whatis "Intel(R) oneAPI Threading Building Blocks for $tbb_target_arch." - setenv TBBROOT "$tbbroot" -prepend-path CPATH "$tbbroot/include" -prepend-path LIBRARY_PATH "$tbbroot/lib/$tbb_target_arch/gcc4.8" -prepend-path LD_LIBRARY_PATH "$tbbroot/lib/$tbb_target_arch/gcc4.8" +prepend-path CPATH "$tbbroot/include32:$tbbroot/include" +prepend-path LIBRARY_PATH "$tbbroot/lib32" +prepend-path LD_LIBRARY_PATH "$tbbroot/lib32" prepend-path CMAKE_PREFIX_PATH "$tbbroot" +prepend-path PKG_CONFIG_PATH "$tbbroot/lib32/pkgconfig" diff --git a/third-party/tbb/integration/linux/oneapi/vars.sh b/third-party/tbb/integration/linux/oneapi/vars.sh new file mode 100644 index 00000000..ffcf56a5 --- /dev/null +++ b/third-party/tbb/integration/linux/oneapi/vars.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# shellcheck shell=sh +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [ -z "${SETVARS_CALL:-}" ] ; then + >&2 echo " " + >&2 echo ":: ERROR: This script must be sourced by setvars.sh." + >&2 echo " Try 'source /setvars.sh --help' for help." 
+ >&2 echo " " + return 255 +fi + +if [ -z "${ONEAPI_ROOT:-}" ] ; then + >&2 echo " " + >&2 echo ":: ERROR: This script requires that the ONEAPI_ROOT env variable is set." + >&2 echo " Try 'source \setvars.sh --help' for help." + >&2 echo " " + return 254 +fi + +TBBROOT="${ONEAPI_ROOT}"; export TBBROOT diff --git a/third-party/tbb/integration/windows/env/vars.bat b/third-party/tbb/integration/windows/env/vars.bat index 78d99301..c5ec0ddc 100644 --- a/third-party/tbb/integration/windows/env/vars.bat +++ b/third-party/tbb/integration/windows/env/vars.bat @@ -24,41 +24,70 @@ REM if ^ is not set Intel(R) 64 architecture will be used REM ^ should be one of the following REM vs2019 : Set to use with Microsoft Visual Studio 2019 runtime DLLs REM vs2022 : Set to use with Microsoft Visual Studio 2022 runtime DLLs -REM all : Set to use TBB statically linked with Microsoft Visual C++ runtime -REM if ^ is not set TBB statically linked with Microsoft Visual C++ runtime will be used. +REM all : Set to use oneTBB statically linked with Microsoft Visual C++ runtime +REM if ^ is not set oneTBB dynamically linked with Microsoft Visual C++ runtime will be used. set "SCRIPT_NAME=%~nx0" -set "TBB_BIN_DIR=%~d0%~p0" -set "TBBROOT=%TBB_BIN_DIR%.." +set "TBB_SCRIPT_DIR=%~d0%~p0" +set "TBBROOT=%TBB_SCRIPT_DIR%.." 
:: Set the default arguments set TBB_TARGET_ARCH=intel64 -set TBB_TARGET_VS=vc_mt +set TBB_ARCH_SUFFIX= +set TBB_TARGET_VS=vc14 :ParseArgs :: Parse the incoming arguments -if /i "%1"=="" goto SetEnv +if /i "%1"=="" goto ParseLayout if /i "%1"=="ia32" (set TBB_TARGET_ARCH=ia32) & shift & goto ParseArgs if /i "%1"=="intel64" (set TBB_TARGET_ARCH=intel64) & shift & goto ParseArgs if /i "%1"=="vs2019" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs if /i "%1"=="vs2022" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs if /i "%1"=="all" (set TBB_TARGET_VS=vc_mt) & shift & goto ParseArgs -:SetEnv -if exist "%TBBROOT%\redist\%TBB_TARGET_ARCH%\%TBB_TARGET_VS%\tbb12.dll" ( - set "TBB_DLL_PATH=%TBBROOT%\redist\%TBB_TARGET_ARCH%\%TBB_TARGET_VS%" +:ParseLayout +if exist "%TBBROOT%\redist\" ( + set "TBB_BIN_DIR=%TBBROOT%\redist" + set "TBB_SUBDIR=%TBB_TARGET_ARCH%" + goto SetEnv +) + +if "%TBB_TARGET_ARCH%" == "ia32" ( + set TBB_ARCH_SUFFIX=32 +) +if exist "%TBBROOT%\bin%TBB_ARCH_SUFFIX%" ( + set "TBB_BIN_DIR=%TBBROOT%\bin%TBB_ARCH_SUFFIX%" + if "%TBB_TARGET_VS%" == "vc14" ( + set TBB_TARGET_VS= + ) + goto SetEnv ) -if exist "%TBBROOT%\..\redist\%TBB_TARGET_ARCH%\tbb\%TBB_TARGET_VS%\tbb12.dll" ( - set "TBB_DLL_PATH=%TBBROOT%\..\redist\%TBB_TARGET_ARCH%\tbb\%TBB_TARGET_VS%" +:: Couldn't parse TBBROOT/bin, unset variable +set TBB_ARCH_SUFFIX= + +if exist "%TBBROOT%\..\redist\" ( + set "TBB_BIN_DIR=%TBBROOT%\..\redist" + set "TBB_SUBDIR=%TBB_TARGET_ARCH%\tbb" + goto SetEnv +) + +:SetEnv +if exist "%TBB_BIN_DIR%\%TBB_SUBDIR%\%TBB_TARGET_VS%\tbb12.dll" ( + set "TBB_DLL_PATH=%TBB_BIN_DIR%\%TBB_SUBDIR%\%TBB_TARGET_VS%" +) else ( + echo: + echo :: ERROR: tbb12.dll library does not exist in "%TBB_BIN_DIR%\%TBB_SUBDIR%\%TBB_TARGET_VS%\" + echo: + exit /b 255 ) set "PATH=%TBB_DLL_PATH%;%PATH%" -set "LIB=%TBBROOT%\lib\%TBB_TARGET_ARCH%\%TBB_TARGET_VS%;%LIB%" +set "LIB=%TBBROOT%\lib%TBB_ARCH_SUFFIX%\%TBB_SUBDIR%\%TBB_TARGET_VS%;%LIB%" set "INCLUDE=%TBBROOT%\include;%INCLUDE%" set 
"CPATH=%TBBROOT%\include;%CPATH%" set "CMAKE_PREFIX_PATH=%TBBROOT%;%CMAKE_PREFIX_PATH%" -set "PKG_CONFIG_PATH=%TBBROOT%\lib\pkgconfig;%PKG_CONFIG_PATH%" +set "PKG_CONFIG_PATH=%TBBROOT%\lib%TBB_ARCH_SUFFIX%\pkgconfig;%PKG_CONFIG_PATH%" :End exit /B 0 diff --git a/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets b/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets index ab1f244f..1c94a12c 100644 --- a/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets +++ b/third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets @@ -1,6 +1,6 @@ - $(MSBuildThisFileDirectory)..\..\lib\native\include;%(AdditionalIncludeDirectories) + $(MSBuildThisFileDirectory)..\..\build\native\include;%(AdditionalIncludeDirectories) TBB_USE_DEBUG;%(PreprocessorDefinitions) @@ -27,25 +27,25 @@ - $(MSBuildThisFileDirectory)..\..\lib\native\win-x86;%(AdditionalLibraryDirectories) + $(MSBuildThisFileDirectory)..\..\build\native\win-x86;%(AdditionalLibraryDirectories) tbb12.lib;tbbmalloc.lib;tbbmalloc_proxy.lib;%(AdditionalDependencies) - $(MSBuildThisFileDirectory)..\..\lib\native\win-x64;%(AdditionalLibraryDirectories) + $(MSBuildThisFileDirectory)..\..\build\native\win-x64;%(AdditionalLibraryDirectories) tbb12.lib;tbbmalloc.lib;tbbmalloc_proxy.lib;%(AdditionalDependencies) - $(MSBuildThisFileDirectory)..\..\lib\native\win-x86;%(AdditionalLibraryDirectories) + $(MSBuildThisFileDirectory)..\..\build\native\win-x86;%(AdditionalLibraryDirectories) tbb12_debug.lib;tbbmalloc_debug.lib;tbbmalloc_proxy_debug.lib;%(AdditionalDependencies) - $(MSBuildThisFileDirectory)..\..\lib\native\win-x64;%(AdditionalLibraryDirectories) + $(MSBuildThisFileDirectory)..\..\build\native\win-x64;%(AdditionalLibraryDirectories) tbb12_debug.lib;tbbmalloc_debug.lib;tbbmalloc_proxy_debug.lib;%(AdditionalDependencies) diff --git a/third-party/tbb/integration/windows/oneapi/vars.bat b/third-party/tbb/integration/windows/oneapi/vars.bat new file mode 100644 index 
00000000..9c53c710 --- /dev/null +++ b/third-party/tbb/integration/windows/oneapi/vars.bat @@ -0,0 +1,56 @@ +@echo off +REM +REM Copyright (c) 2023 Intel Corporation +REM +REM Licensed under the Apache License, Version 2.0 (the "License"); +REM you may not use this file except in compliance with the License. +REM You may obtain a copy of the License at +REM +REM http://www.apache.org/licenses/LICENSE-2.0 +REM +REM Unless required by applicable law or agreed to in writing, software +REM distributed under the License is distributed on an "AS IS" BASIS, +REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +REM See the License for the specific language governing permissions and +REM limitations under the License. +REM + +if not defined SETVARS_CALL ( + echo: + echo :: ERROR: This script must be executed by setvars.bat. + echo: Try '[install-dir]\setvars.bat --help' for help. + echo: + exit /b 255 +) + +if not defined ONEAPI_ROOT ( + echo: + echo :: ERROR: This script requires that the ONEAPI_ROOT env variable is set." + echo: Try '[install-dir]\setvars.bat --help' for help. 
+ echo: + exit /b 254 +) + +set "TBBROOT=%ONEAPI_ROOT%" + +:: Set the default arguments +set "TBB_TARGET_ARCH=%INTEL_TARGET_ARCH%" +set TBB_TARGET_VS= +set ARCH_SUFFIX= + +:ParseArgs +:: Parse the incoming arguments +if /i "%1"=="" goto SetEnv +if /i "%1"=="vs2019" (set TBB_TARGET_VS= ) & shift & goto ParseArgs +if /i "%1"=="vs2022" (set TBB_TARGET_VS= ) & shift & goto ParseArgs +if /i "%1"=="all" (set TBB_TARGET_VS=vc_mt) & shift & goto ParseArgs + +if "%TBB_TARGET_ARCH%"=="ia32" set ARCH_SUFFIX=32 + +:SetEnv +if exist "%TBBROOT%\bin%ARCH_SUFFIX%\%TBB_TARGET_VS%\tbb12.dll" ( + set "TBB_DLL_PATH=%TBBROOT%\bin%ARCH_SUFFIX%\%TBB_TARGET_VS%" +) + +:End +exit /B 0 diff --git a/third-party/tbb/python/CMakeLists.txt b/third-party/tbb/python/CMakeLists.txt index 33d2d081..748921a5 100644 --- a/third-party/tbb/python/CMakeLists.txt +++ b/third-party/tbb/python/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ find_package(PythonInterp 3.5 REQUIRED) -set(PYTHON_BUILD_WORK_DIR .) +set(PYTHON_BUILD_WORK_DIR python_build) add_custom_target( python_copy @@ -40,7 +40,7 @@ add_custom_target( ${PYTHON_EXECUTABLE} ${PYTHON_BUILD_WORK_DIR}/setup.py build -b${PYTHON_BUILD_WORK_DIR} build_ext ${TBB4PY_INCLUDE_STRING} -L$ - install --prefix ${PYTHON_BUILD_WORK_DIR}/build -f + install --prefix build -f COMMENT "Build and install to work directory the oneTBB Python module" ) diff --git a/third-party/tbb/python/README.md b/third-party/tbb/python/README.md index 59ec6a10..e7e3318d 100644 --- a/third-party/tbb/python/README.md +++ b/third-party/tbb/python/README.md @@ -13,7 +13,7 @@ parallelized using Intel® oneAPI Math Kernel Library or/and oneTBB. The module implements Pool class with the standard interface using oneTBB which can be used to replace Python's ThreadPool. 
Thanks to the monkey-patching technique implemented in class Monkey, no source code change is needed in order to enable threading composability in Python programs. -For more information and examples, please refer to [online blog](http://software.intel.com/en-us/blogs/2016/04/04unleash-parallel-performance-of-python-programs). +For more information and examples, please refer to [forum discussion](https://community.intel.com/t5/Intel-Distribution-for-Python/TBB-module-Unleash-parallel-performance-of-Python-programs/m-p/1074459). ## Directories - **rml** - The folder contains sources for building the plugin with cross-process dynamic thread scheduler implementation. diff --git a/third-party/tbb/python/setup.py b/third-party/tbb/python/setup.py index 7c050188..edf8580f 100644 --- a/third-party/tbb/python/setup.py +++ b/third-party/tbb/python/setup.py @@ -85,7 +85,7 @@ class TBBBuild(build): description ="Python API for oneTBB", long_description="Python API to Intel(R) oneAPI Threading Building Blocks library (oneTBB) " "extended with standard Pool implementation and monkey-patching", - url ="https://software.intel.com/en-us/intel-tbb", + url ="https://www.intel.com/content/www/us/en/developer/tools/oneapi/onetbb.html", author ="Intel Corporation", author_email="inteltbbdevelopers@intel.com", license ="Dual license: Apache or Proprietary", diff --git a/third-party/tbb/python/tbb/pool.py b/third-party/tbb/python/tbb/pool.py index a372324d..dd5c8190 100644 --- a/third-party/tbb/python/tbb/pool.py +++ b/third-party/tbb/python/tbb/pool.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016-2023 Intel Corporation +# Copyright (c) 2016-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -89,8 +89,8 @@ class Pool(object): def __init__(self, nworkers=0, name="Pool"): """ - \param nworkers (integer) number of worker threads to start - \param name (string) prefix for the worker threads' name + :param nworkers (integer) number of worker threads to start + :param name (string) prefix for the worker threads' name """ self._closed = False self._tasks = task_group() @@ -268,8 +268,8 @@ class Job: def __init__(self, func, args, kwds, apply_result): """ - \param func/args/kwds used to call the function - \param apply_result ApplyResult object that holds the result + :param func/args/kwds used to call the function + :param apply_result ApplyResult object that holds the result of the function call """ self._func = func @@ -317,10 +317,10 @@ class ApplyResult(object): def __init__(self, collector=None, callback=None): """ - \param collector when not None, the notify_ready() method of + :param collector when not None, the notify_ready() method of the collector will be called when the result from the Job is ready - \param callback when not None, function to call when the + :param callback when not None, function to call when the result becomes available (this is the parameter passed to the Pool::*_async() methods. """ @@ -404,7 +404,7 @@ class AbstractResultCollector(object): def __init__(self, to_notify): """ - \param to_notify ApplyResult object to notify when all the + :param to_notify ApplyResult object to notify when all the results we're waiting for become available. Can be None. 
""" self._to_notify = to_notify @@ -414,7 +414,7 @@ def register_result(self, apply_result): always be called BEFORE the Jobs get submitted to the work queue, and BEFORE the __iter__ and _get_result() methods can be called - \param apply_result ApplyResult object to add in our collection + :param apply_result ApplyResult object to add in our collection """ raise NotImplementedError("Children classes must implement it") @@ -422,7 +422,7 @@ def notify_ready(self, apply_result): """Called by the ApplyResult object (already registered via register_result()) that it is now ready (ie. the Job's result is available or an exception has been raised). - \param apply_result ApplyResult object telling us that the job + :param apply_result ApplyResult object telling us that the job has been processed """ raise NotImplementedError("Children classes must implement it") @@ -431,8 +431,8 @@ def _get_result(self, idx, timeout=None): """Called by the CollectorIterator object to retrieve the result's values one after another (order defined by the implementation) - \param idx The index of the result we want, wrt collector's order - \param timeout integer telling how long to wait (in seconds) + :param idx The index of the result we want, wrt collector's order + :param timeout integer telling how long to wait (in seconds) for the result at index idx to be available, or None (wait forever) """ @@ -450,7 +450,7 @@ class CollectorIterator(object): AbstractResultCollector::__iter__() method""" def __init__(self, collector): - """\param AbstractResultCollector instance""" + """:param AbstractResultCollector instance""" self._collector = collector self._idx = 0 @@ -486,7 +486,7 @@ class UnorderedResultCollector(AbstractResultCollector): def __init__(self, to_notify=None): """ - \param to_notify ApplyResult object to notify when all the + :param to_notify ApplyResult object to notify when all the results we're waiting for become available. Can be None. 
""" AbstractResultCollector.__init__(self, to_notify) @@ -499,7 +499,7 @@ def register_result(self, apply_result): always be called BEFORE the Jobs get submitted to the work queue, and BEFORE the __iter__ and _get_result() methods can be called - \param apply_result ApplyResult object to add in our collection + :param apply_result ApplyResult object to add in our collection """ self._expected += 1 @@ -507,8 +507,8 @@ def _get_result(self, idx, timeout=None): """Called by the CollectorIterator object to retrieve the result's values one after another, in the order the results have become available. - \param idx The index of the result we want, wrt collector's order - \param timeout integer telling how long to wait (in seconds) + :param idx The index of the result we want, wrt collector's order + :param timeout integer telling how long to wait (in seconds) for the result at index idx to be available, or None (wait forever) """ @@ -535,7 +535,7 @@ def notify_ready(self, apply_result=None): """Called by the ApplyResult object (already registered via register_result()) that it is now ready (ie. the Job's result is available or an exception has been raised). - \param apply_result ApplyResult object telling us that the job + :param apply_result ApplyResult object telling us that the job has been processed """ first_item = False @@ -560,9 +560,9 @@ class OrderedResultCollector(AbstractResultCollector): def __init__(self, to_notify=None, as_iterator=True): """ - \param to_notify ApplyResult object to notify when all the + :param to_notify ApplyResult object to notify when all the results we're waiting for become available. Can be None. 
- \param as_iterator boolean telling whether the result value + :param as_iterator boolean telling whether the result value set on to_notify should be an iterator (available as soon as 1 result arrived) or a list (available only after the last result arrived) @@ -578,7 +578,7 @@ def register_result(self, apply_result): always be called BEFORE the Jobs get submitted to the work queue, and BEFORE the __iter__ and _get_result() methods can be called - \param apply_result ApplyResult object to add in our collection + :param apply_result ApplyResult object to add in our collection """ self._results.append(apply_result) self._remaining += 1 @@ -587,8 +587,8 @@ def _get_result(self, idx, timeout=None): """Called by the CollectorIterator object to retrieve the result's values one after another (order defined by the implementation) - \param idx The index of the result we want, wrt collector's order - \param timeout integer telling how long to wait (in seconds) + :param idx The index of the result we want, wrt collector's order + :param timeout integer telling how long to wait (in seconds) for the result at index idx to be available, or None (wait forever) """ @@ -600,7 +600,7 @@ def notify_ready(self, apply_result): """Called by the ApplyResult object (already registered via register_result()) that it is now ready (ie. the Job's result is available or an exception has been raised). 
- \param apply_result ApplyResult object telling us that the job + :param apply_result ApplyResult object telling us that the job has been processed """ got_first = False diff --git a/third-party/tbb/src/tbb/CMakeLists.txt b/third-party/tbb/src/tbb/CMakeLists.txt index 6aade7db..b996c736 100644 --- a/third-party/tbb/src/tbb/CMakeLists.txt +++ b/third-party/tbb/src/tbb/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ add_library(tbb itt_notify.cpp main.cpp market.cpp + tcm_adaptor.cpp misc.cpp misc_ex.cpp observer_proxy.cpp @@ -39,6 +40,9 @@ add_library(tbb task.cpp task_dispatcher.cpp task_group_context.cpp + thread_dispatcher.cpp + thread_request_serializer.cpp + threading_control.cpp version.cpp queuing_rw_mutex.cpp) @@ -56,6 +60,7 @@ target_compile_definitions(tbb $<$:TBB_USE_DEBUG> PRIVATE __TBB_BUILD + ${TBB_RESUMABLE_TASKS_USE_THREADS} $<$>:__TBB_DYNAMIC_LOAD_ENABLED=0> $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>) @@ -121,59 +126,71 @@ target_link_libraries(tbb ${TBB_COMMON_LINK_LIBS} ) -tbb_install_target(tbb) - -if (MSVC) - # Create a copy of target linker file (tbb[_debug].lib) with legacy name (tbb[_debug].lib) - # to support previous user experience for linkage. 
- install(FILES - $ - DESTINATION lib - CONFIGURATIONS RelWithDebInfo Release MinSizeRel - RENAME tbb.lib - COMPONENT devel - ) - - install(FILES - $ - DESTINATION lib - CONFIGURATIONS Debug - RENAME tbb_debug.lib - COMPONENT devel - ) -endif() - -set(_tbb_pc_lib_name tbb) - -if (WIN32) - set(_tbb_pc_lib_name ${_tbb_pc_lib_name}${TBB_BINARY_VERSION}) -endif() - -if (CMAKE_SIZEOF_VOID_P EQUAL 8) - set(TBB_PC_NAME tbb) -else() - set(TBB_PC_NAME tbb32) +if(TBB_BUILD_APPLE_FRAMEWORKS) + set_target_properties(tbb PROPERTIES + FRAMEWORK TRUE + FRAMEWORK_VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} + XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbb + MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbb + MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} + MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBB_BINARY_VERSION}) endif() -set(_prefix_for_pc_file "${CMAKE_INSTALL_PREFIX}") - -if (IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}") - set(_libdir_for_pc_file "${CMAKE_INSTALL_LIBDIR}") -else() - set(_libdir_for_pc_file "\${prefix}/${CMAKE_INSTALL_LIBDIR}") -endif() +tbb_install_target(tbb) -if (IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}") - set(_includedir_for_pc_file "${CMAKE_INSTALL_INCLUDEDIR}") -else() - set(_includedir_for_pc_file "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") +if (TBB_INSTALL) + if (MSVC) + # Create a copy of target linker file (tbb[_debug].lib) with legacy name (tbb[_debug].lib) + # to support previous user experience for linkage. 
+ install(FILES + $ + DESTINATION lib + CONFIGURATIONS RelWithDebInfo Release MinSizeRel + RENAME tbb.lib + COMPONENT devel + ) + + install(FILES + $ + DESTINATION lib + CONFIGURATIONS Debug + RENAME tbb_debug.lib + COMPONENT devel + ) + endif() + + set(_tbb_pc_lib_name tbb) + + if (WIN32) + set(_tbb_pc_lib_name ${_tbb_pc_lib_name}${TBB_BINARY_VERSION}) + endif() + + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + set(TBB_PC_NAME tbb) + else() + set(TBB_PC_NAME tbb32) + endif() + + set(_prefix_for_pc_file "${CMAKE_INSTALL_PREFIX}") + + if (IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}") + set(_libdir_for_pc_file "${CMAKE_INSTALL_LIBDIR}") + else() + set(_libdir_for_pc_file "\${prefix}/${CMAKE_INSTALL_LIBDIR}") + endif() + + if (IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}") + set(_includedir_for_pc_file "${CMAKE_INSTALL_INCLUDEDIR}") + else() + set(_includedir_for_pc_file "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") + endif() + + configure_file(${PROJECT_SOURCE_DIR}/integration/pkg-config/tbb.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc @ONLY) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc + DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/ + COMPONENT devel) endif() -configure_file(${PROJECT_SOURCE_DIR}/integration/pkg-config/tbb.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc @ONLY) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc - DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/ - COMPONENT devel) - if (COMMAND tbb_gen_vars) tbb_gen_vars(tbb) endif() diff --git a/third-party/tbb/src/tbb/allocator.cpp b/third-party/tbb/src/tbb/allocator.cpp index 5453aeab..888f43fd 100644 --- a/third-party/tbb/src/tbb/allocator.cpp +++ b/third-party/tbb/src/tbb/allocator.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -118,7 +118,7 @@ static const dynamic_link_descriptor MallocLinkTable[] = { #if _WIN32||_WIN64 #define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll" #elif __APPLE__ -#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".dylib" +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".2.dylib" #elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so" #elif __unix__ // Note that order of these #elif's is important! diff --git a/third-party/tbb/src/tbb/arena.cpp b/third-party/tbb/src/tbb/arena.cpp index e79f689b..0e7cf43c 100644 --- a/third-party/tbb/src/tbb/arena.cpp +++ b/third-party/tbb/src/tbb/arena.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include "task_dispatcher.h" #include "governor.h" +#include "threading_control.h" #include "arena.h" #include "itt_notify.h" #include "semaphore.h" @@ -59,7 +60,6 @@ numa_binding_observer* construct_binding_observer( d1::task_arena* ta, int num_s if ((core_type >= 0 && core_type_count() > 1) || (numa_id >= 0 && numa_node_count() > 1) || max_threads_per_core > 0) { binding_observer = new(allocate_memory(sizeof(numa_binding_observer))) numa_binding_observer(ta, num_slots, numa_id, core_type, max_threads_per_core); __TBB_ASSERT(binding_observer, "Failure during NUMA binding observer allocation and construction"); - binding_observer->observe(true); } return binding_observer; } @@ -72,6 +72,83 @@ void destroy_binding_observer( numa_binding_observer* binding_observer ) { } #endif /*!__TBB_ARENA_BINDING*/ +void arena::on_thread_leaving(unsigned ref_param) { + // + // Implementation of arena destruction synchronization logic contained various + // bugs/flaws at the different stages of its evolution, so below is a detailed + // description of the 
issues taken into consideration in the framework of the + // current design. + // + // In case of using fire-and-forget tasks (scheduled via task::enqueue()) + // external thread is allowed to leave its arena before all its work is executed, + // and market may temporarily revoke all workers from this arena. Since revoked + // workers never attempt to reset arena state to EMPTY and cancel its request + // to RML for threads, the arena object is destroyed only when both the last + // thread is leaving it and arena's state is EMPTY (that is its external thread + // left and it does not contain any work). + // Thus resetting arena to EMPTY state (as earlier TBB versions did) should not + // be done here (or anywhere else in the external thread to that matter); doing so + // can result either in arena's premature destruction (at least without + // additional costly checks in workers) or in unnecessary arena state changes + // (and ensuing workers migration). + // + // A worker that checks for work presence and transitions arena to the EMPTY + // state (in snapshot taking procedure arena::out_of_work()) updates + // arena::my_pool_state first and only then arena::my_num_workers_requested. + // So the check for work absence must be done against the latter field. + // + // In a time window between decrementing the active threads count and checking + // if there is an outstanding request for workers. New worker thread may arrive, + // finish remaining work, set arena state to empty, and leave decrementing its + // refcount and destroying. Then the current thread will destroy the arena + // the second time. To preclude it a local copy of the outstanding request + // value can be stored before decrementing active threads count. + // + // But this technique may cause two other problem. When the stored request is + // zero, it is possible that arena still has threads and they can generate new + // tasks and thus re-establish non-zero requests. 
Then all the threads can be + // revoked (as described above) leaving this thread the last one, and causing + // it to destroy non-empty arena. + // + // The other problem takes place when the stored request is non-zero. Another + // thread may complete the work, set arena state to empty, and leave without + // arena destruction before this thread decrements the refcount. This thread + // cannot destroy the arena either. Thus the arena may be "orphaned". + // + // In both cases we cannot dereference arena pointer after the refcount is + // decremented, as our arena may already be destroyed. + // + // If this is the external thread, the market is protected by refcount to it. + // In case of workers market's liveness is ensured by the RML connection + // rundown protocol, according to which the client (i.e. the market) lives + // until RML server notifies it about connection termination, and this + // notification is fired only after all workers return into RML. + // + // Thus if we decremented refcount to zero we ask the market to check arena + // state (including the fact if it is alive) under the lock. + // + + __TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter"); + + // When there is no workers someone must free arena, as + // without workers, no one calls out_of_work(). 
+ if (ref_param == ref_external && !my_mandatory_concurrency.test()) { + out_of_work(); + } + + threading_control* tc = my_threading_control; + auto tc_client_snapshot = tc->prepare_client_destruction(my_tc_client); + // Release our reference to sync with destroy_client + unsigned remaining_ref = my_references.fetch_sub(ref_param, std::memory_order_release) - ref_param; + // do not access `this` it might be destroyed already + if (remaining_ref == 0) { + if (tc->try_destroy_client(tc_client_snapshot)) { + // We are requested to destroy ourself + free_arena(); + } + } +} + std::size_t arena::occupy_free_slot_in_range( thread_data& tls, std::size_t lower, std::size_t upper ) { if ( lower >= upper ) return out_of_arena; // Start search for an empty slot from the one we occupied the last time @@ -104,19 +181,22 @@ std::size_t arena::occupy_free_slot(thread_data& tls) { std::uintptr_t arena::calculate_stealing_threshold() { stack_anchor_type anchor; - return r1::calculate_stealing_threshold(reinterpret_cast(&anchor), my_market->worker_stack_size()); + return r1::calculate_stealing_threshold(reinterpret_cast(&anchor), my_threading_control->worker_stack_size()); } void arena::process(thread_data& tls) { governor::set_thread_data(tls); // TODO: consider moving to create_one_job. 
__TBB_ASSERT( is_alive(my_guard), nullptr); - __TBB_ASSERT( my_num_slots > 1, nullptr); + __TBB_ASSERT( my_num_slots >= 1, nullptr); std::size_t index = occupy_free_slot(tls); if (index == out_of_arena) { - on_thread_leaving(); + on_thread_leaving(ref_worker); return; } + + my_tc_client.get_pm_client()->register_thread(); + __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); tls.attach_arena(*this, index); // worker thread enters the dispatch loop to look for a work @@ -156,27 +236,27 @@ void arena::process(thread_data& tls) { __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr); __TBB_ASSERT(is_alive(my_guard), nullptr); + my_tc_client.get_pm_client()->unregister_thread(); + // In contrast to earlier versions of TBB (before 3.0 U5) now it is possible // that arena may be temporarily left unpopulated by threads. See comments in // arena::on_thread_leaving() for more details. - on_thread_leaving(); + on_thread_leaving(ref_worker); __TBB_ASSERT(tls.my_arena == this, "my_arena is used as a hint when searching the arena to join"); } -arena::arena ( market& m, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level ) -{ +arena::arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level) { __TBB_ASSERT( !my_guard, "improperly allocated arena?" ); __TBB_ASSERT( sizeof(my_slots[0]) % cache_line_size()==0, "arena::slot size not multiple of cache line size" ); __TBB_ASSERT( is_aligned(this, cache_line_size()), "arena misaligned" ); - my_market = &m; + my_threading_control = control; my_limit = 1; // Two slots are mandatory: for the external thread, and for 1 worker (required to support starvation resistant tasks). 
- my_num_slots = num_arena_slots(num_slots); + my_num_slots = num_arena_slots(num_slots, num_reserved_slots); my_num_reserved_slots = num_reserved_slots; my_max_num_workers = num_slots-num_reserved_slots; my_priority_level = priority_level; my_references = ref_external; // accounts for the external thread - my_aba_epoch = m.my_arenas_aba_epoch.load(std::memory_order_relaxed); my_observers.my_arena = this; my_co_cache.init(4 * num_slots); __TBB_ASSERT ( my_max_num_workers <= my_num_slots, nullptr); @@ -199,36 +279,29 @@ arena::arena ( market& m, unsigned num_slots, unsigned num_reserved_slots, unsig #if __TBB_PREVIEW_CRITICAL_TASKS my_critical_task_stream.initialize(my_num_slots); #endif -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - my_local_concurrency_requests = 0; - my_local_concurrency_flag.clear(); - my_global_concurrency_mode.store(false, std::memory_order_relaxed); -#endif + my_mandatory_requests = 0; } -arena& arena::allocate_arena( market& m, unsigned num_slots, unsigned num_reserved_slots, - unsigned priority_level ) +arena& arena::allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, + unsigned priority_level) { __TBB_ASSERT( sizeof(base_type) + sizeof(arena_slot) == sizeof(arena), "All arena data fields must go to arena_base" ); __TBB_ASSERT( sizeof(base_type) % cache_line_size() == 0, "arena slots area misaligned: wrong padding" ); __TBB_ASSERT( sizeof(mail_outbox) == max_nfs_size, "Mailbox padding is wrong" ); - std::size_t n = allocation_size(num_arena_slots(num_slots)); + std::size_t n = allocation_size(num_arena_slots(num_slots, num_reserved_slots)); unsigned char* storage = (unsigned char*)cache_aligned_allocate(n); // Zero all slots to indicate that they are empty std::memset( storage, 0, n ); - return *new( storage + num_arena_slots(num_slots) * sizeof(mail_outbox) ) - arena(m, num_slots, num_reserved_slots, priority_level); + + return *new( storage + num_arena_slots(num_slots, num_reserved_slots) * 
sizeof(mail_outbox) ) + arena(control, num_slots, num_reserved_slots, priority_level); } void arena::free_arena () { __TBB_ASSERT( is_alive(my_guard), nullptr); __TBB_ASSERT( !my_references.load(std::memory_order_relaxed), "There are threads in the dying arena" ); - __TBB_ASSERT( !my_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" ); - __TBB_ASSERT( my_pool_state.load(std::memory_order_relaxed) == SNAPSHOT_EMPTY || !my_max_num_workers, - "Inconsistent state of a dying arena" ); -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - __TBB_ASSERT( !my_global_concurrency_mode, nullptr); -#endif + __TBB_ASSERT( !my_total_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" ); + __TBB_ASSERT( is_empty(), "Inconsistent state of a dying arena" ); #if __TBB_ARENA_BINDING if (my_numa_binding_observer != nullptr) { destroy_binding_observer(my_numa_binding_observer); @@ -254,15 +327,11 @@ void arena::free_arena () { #if __TBB_PREVIEW_CRITICAL_TASKS __TBB_ASSERT( my_critical_task_stream.empty(), "Not all critical tasks were executed"); #endif - // remove an internal reference - my_market->release( /*is_public=*/false, /*blocking_terminate=*/false ); - // Clear enfources synchronization with observe(false) my_observers.clear(); void* storage = &mailbox(my_num_slots-1); __TBB_ASSERT( my_references.load(std::memory_order_relaxed) == 0, nullptr); - __TBB_ASSERT( my_pool_state.load(std::memory_order_relaxed) == SNAPSHOT_EMPTY || !my_max_num_workers, nullptr); this->~arena(); #if TBB_USE_ASSERT > 1 std::memset( storage, 0, allocation_size(my_num_slots) ); @@ -274,78 +343,100 @@ bool arena::has_enqueued_tasks() { return !my_fifo_task_stream.empty(); } -bool arena::is_out_of_work() { -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - if (my_local_concurrency_flag.try_clear_if([this] { - return !has_enqueued_tasks(); - })) { - my_market->adjust_demand(*this, /* delta = */ -1, /* mandatory = */ true); +void arena::request_workers(int 
mandatory_delta, int workers_delta, bool wakeup_threads) { + my_threading_control->adjust_demand(my_tc_client, mandatory_delta, workers_delta); + + if (wakeup_threads) { + // Notify all sleeping threads that work has appeared in the arena. + get_waiting_threads_monitor().notify([&] (market_context context) { + return this == context.my_arena_addr; + }); } -#endif +} +bool arena::has_tasks() { // TODO: rework it to return at least a hint about where a task was found; better if the task itself. - switch (my_pool_state.load(std::memory_order_acquire)) { - case SNAPSHOT_EMPTY: - return true; - case SNAPSHOT_FULL: { - // Use unique id for "busy" in order to avoid ABA problems. - const pool_state_t busy = pool_state_t(&busy); - // Helper for CAS execution - pool_state_t expected_state; - - // Request permission to take snapshot - expected_state = SNAPSHOT_FULL; - if (my_pool_state.compare_exchange_strong(expected_state, busy)) { - // Got permission. Take the snapshot. - // NOTE: This is not a lock, as the state can be set to FULL at - // any moment by a thread that spawns/enqueues new task. - std::size_t n = my_limit.load(std::memory_order_acquire); - // Make local copies of volatile parameters. Their change during - // snapshot taking procedure invalidates the attempt, and returns - // this thread into the dispatch loop. - std::size_t k; - for (k = 0; k < n; ++k) { - if (my_slots[k].task_pool.load(std::memory_order_relaxed) != EmptyTaskPool && - my_slots[k].head.load(std::memory_order_relaxed) < my_slots[k].tail.load(std::memory_order_relaxed)) - { - // k-th primary task pool is nonempty and does contain tasks. - break; - } - if (my_pool_state.load(std::memory_order_acquire) != busy) - return false; // the work was published - } - bool work_absent = k == n; - // Test and test-and-set. 
- if (my_pool_state.load(std::memory_order_acquire) == busy) { - bool no_stream_tasks = !has_enqueued_tasks() && my_resume_task_stream.empty(); + std::size_t n = my_limit.load(std::memory_order_acquire); + bool tasks_are_available = false; + for (std::size_t k = 0; k < n && !tasks_are_available; ++k) { + tasks_are_available = !my_slots[k].is_empty(); + } + tasks_are_available = tasks_are_available || has_enqueued_tasks() || !my_resume_task_stream.empty(); #if __TBB_PREVIEW_CRITICAL_TASKS - no_stream_tasks = no_stream_tasks && my_critical_task_stream.empty(); + tasks_are_available = tasks_are_available || !my_critical_task_stream.empty(); #endif - work_absent = work_absent && no_stream_tasks; - if (work_absent) { - // save current demand value before setting SNAPSHOT_EMPTY, - // to avoid race with advertise_new_work. - int current_demand = (int)my_max_num_workers; - expected_state = busy; - if (my_pool_state.compare_exchange_strong(expected_state, SNAPSHOT_EMPTY)) { - // This thread transitioned pool to empty state, and thus is - // responsible for telling the market that there is no work to do. - my_market->adjust_demand(*this, -current_demand, /* mandatory = */ false); - return true; - } - return false; - } - // Undo previous transition SNAPSHOT_FULL-->busy, unless another thread undid it. - expected_state = busy; - my_pool_state.compare_exchange_strong(expected_state, SNAPSHOT_FULL); - } + return tasks_are_available; +} + +void arena::out_of_work() { + // We should try unset my_pool_state first due to keep arena invariants in consistent state + // Otherwise, we might have my_pool_state = false and my_mandatory_concurrency = true that is broken invariant + bool disable_mandatory = my_mandatory_concurrency.try_clear_if([this] { return !has_enqueued_tasks(); }); + bool release_workers = my_pool_state.try_clear_if([this] { return !has_tasks(); }); + + if (disable_mandatory || release_workers) { + int mandatory_delta = disable_mandatory ? 
-1 : 0; + int workers_delta = release_workers ? -(int)my_max_num_workers : 0; + + if (disable_mandatory && is_arena_workerless()) { + // We had set workers_delta to 1 when enabled mandatory concurrency, so revert it now + workers_delta = -1; } - return false; + request_workers(mandatory_delta, workers_delta); } - default: - // Another thread is taking a snapshot. - return false; +} + +void arena::set_top_priority(bool is_top_priority) { + my_is_top_priority.store(is_top_priority, std::memory_order_relaxed); +} + +bool arena::is_top_priority() const { + return my_is_top_priority.load(std::memory_order_relaxed); +} + +bool arena::try_join() { + if (is_joinable()) { + my_references += arena::ref_worker; + return true; + } + return false; +} + +void arena::set_allotment(unsigned allotment) { + if (my_num_workers_allotted.load(std::memory_order_relaxed) != allotment) { + my_num_workers_allotted.store(allotment, std::memory_order_relaxed); + } +} + +int arena::update_concurrency(unsigned allotment) { + int delta = allotment - my_num_workers_allotted.load(std::memory_order_relaxed); + if (delta != 0) { + my_num_workers_allotted.store(allotment, std::memory_order_relaxed); } + return delta; +} + +std::pair arena::update_request(int mandatory_delta, int workers_delta) { + __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); + + int min_workers_request = 0; + int max_workers_request = 0; + + // Calculate min request + my_mandatory_requests += mandatory_delta; + min_workers_request = my_mandatory_requests > 0 ? 1 : 0; + + // Calculate max request + my_total_num_workers_requested += workers_delta; + // Clamp worker request into interval [0, my_max_num_workers] + max_workers_request = clamp(my_total_num_workers_requested, 0, + min_workers_request > 0 && is_arena_workerless() ? 
1 : (int)my_max_num_workers); + + return { min_workers_request, max_workers_request }; +} + +thread_control_monitor& arena::get_waiting_threads_monitor() { + return my_threading_control->get_waiting_threads_monitor(); } void arena::enqueue_task(d1::task& t, d1::task_group_context& ctx, thread_data& td) { @@ -356,6 +447,17 @@ void arena::enqueue_task(d1::task& t, d1::task_group_context& ctx, thread_data& advertise_new_work(); } +arena& arena::create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level, d1::constraints constraints) { + __TBB_ASSERT(num_slots > 0, NULL); + __TBB_ASSERT(num_reserved_slots <= num_slots, NULL); + // Add public market reference for an external thread/task_arena (that adds an internal reference in exchange). + arena& a = arena::allocate_arena(control, num_slots, num_reserved_slots, arena_priority_level); + a.my_tc_client = control->create_client(a); + // We should not publish arena until all fields are initialized + control->publish_client(a.my_tc_client, constraints); + return a; +} + } // namespace r1 } // namespace detail } // namespace tbb @@ -382,12 +484,12 @@ void assert_arena_priority_valid( tbb::task_arena::priority ) {} unsigned arena_priority_level( tbb::task_arena::priority a_priority ) { assert_arena_priority_valid( a_priority ); - return market::num_priority_levels - unsigned(int(a_priority) / d1::priority_stride); + return d1::num_priority_levels - unsigned(int(a_priority) / d1::priority_stride); } tbb::task_arena::priority arena_priority( unsigned priority_level ) { auto priority = tbb::task_arena::priority( - (market::num_priority_levels - priority_level) * d1::priority_stride + (d1::num_priority_levels - priority_level) * d1::priority_stride ); assert_arena_priority_valid( priority ); return priority; @@ -434,35 +536,54 @@ void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& ctx, d1::t void task_arena_impl::initialize(d1::task_arena_base& ta) { // 
Enforce global market initialization to properly initialize soft limit (void)governor::get_thread_data(); + d1::constraints arena_constraints; + +#if __TBB_ARENA_BINDING + arena_constraints = d1::constraints{} + .set_core_type(ta.core_type()) + .set_max_threads_per_core(ta.max_threads_per_core()) + .set_numa_id(ta.my_numa_id); +#endif /*__TBB_ARENA_BINDING*/ + if (ta.my_max_concurrency < 1) { #if __TBB_ARENA_BINDING - d1::constraints arena_constraints = d1::constraints{} - .set_core_type(ta.core_type()) - .set_max_threads_per_core(ta.max_threads_per_core()) - .set_numa_id(ta.my_numa_id); ta.my_max_concurrency = (int)default_concurrency(arena_constraints); #else /*!__TBB_ARENA_BINDING*/ ta.my_max_concurrency = (int)governor::default_num_threads(); #endif /*!__TBB_ARENA_BINDING*/ } +#if __TBB_CPUBIND_PRESENT + numa_binding_observer* observer = construct_binding_observer( + static_cast(&ta), arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots), + ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); + if (observer) { + // TODO: Consider lazy initialization for internal arena so + // the direct calls to observer might be omitted until actual initialization. 
+ observer->on_scheduler_entry(true); + } +#endif /*__TBB_CPUBIND_PRESENT*/ + __TBB_ASSERT(ta.my_arena.load(std::memory_order_relaxed) == nullptr, "Arena already initialized"); unsigned priority_level = arena_priority_level(ta.my_priority); - arena* a = market::create_arena(ta.my_max_concurrency, ta.my_num_reserved_slots, priority_level, /* stack_size = */ 0); - ta.my_arena.store(a, std::memory_order_release); - // add an internal market reference; a public reference was added in create_arena - market::global_market( /*is_public=*/false); -#if __TBB_ARENA_BINDING - a->my_numa_binding_observer = construct_binding_observer( - static_cast(&ta), a->my_num_slots, ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); -#endif /*__TBB_ARENA_BINDING*/ + threading_control* thr_control = threading_control::register_public_reference(); + arena& a = arena::create(thr_control, unsigned(ta.my_max_concurrency), ta.my_num_reserved_slots, priority_level, arena_constraints); + + ta.my_arena.store(&a, std::memory_order_release); +#if __TBB_CPUBIND_PRESENT + a.my_numa_binding_observer = observer; + if (observer) { + observer->on_scheduler_exit(true); + observer->observe(true); + } +#endif /*__TBB_CPUBIND_PRESENT*/ } void task_arena_impl::terminate(d1::task_arena_base& ta) { arena* a = ta.my_arena.load(std::memory_order_relaxed); assert_pointer_valid(a); - a->my_market->release( /*is_public=*/true, /*blocking_terminate=*/false ); - a->on_thread_leaving(); + threading_control::unregister_public_reference(/*blocking_terminate=*/false); + a->on_thread_leaving(arena::ref_external); ta.my_arena.store(nullptr, std::memory_order_relaxed); } @@ -478,10 +599,10 @@ bool task_arena_impl::attach(d1::task_arena_base& ta) { ta.my_num_reserved_slots = a->my_num_reserved_slots; ta.my_priority = arena_priority(a->my_priority_level); ta.my_max_concurrency = ta.my_num_reserved_slots + a->my_max_num_workers; - __TBB_ASSERT(arena::num_arena_slots(ta.my_max_concurrency) == a->my_num_slots, nullptr); + 
__TBB_ASSERT(arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots) == a->my_num_slots, nullptr); ta.my_arena.store(a, std::memory_order_release); - // increases market's ref count for task_arena - market::global_market( /*is_public=*/true ); + // increases threading_control's ref count for task_arena + threading_control::register_public_reference(); return true; } return false; @@ -523,7 +644,7 @@ class nested_arena_context : no_copy { // If the calling thread occupies the slots out of external thread reserve we need to notify the // market that this arena requires one worker less. if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { - td.my_arena->my_market->adjust_demand(*td.my_arena, /* delta = */ -1, /* mandatory = */ false); + td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ -1); } td.my_last_observer = nullptr; @@ -559,7 +680,7 @@ class nested_arena_context : no_copy { // Notify the market that this thread releasing a one slot // that can be used by a worker thread. if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { - td.my_arena->my_market->adjust_demand(*td.my_arena, /* delta = */ 1, /* mandatory = */ false); + td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ 1); } td.leave_task_dispatcher(); @@ -613,7 +734,7 @@ class delegated_task : public d1::task { } void finalize() { m_wait_ctx.release(); // must precede the wakeup - m_monitor.notify([this](std::uintptr_t ctx) { + m_monitor.notify([this] (std::uintptr_t ctx) { return ctx == std::uintptr_t(&m_delegate); }); // do not relax, it needs a fence! 
m_completed.store(true, std::memory_order_release); @@ -702,7 +823,7 @@ void task_arena_impl::wait(d1::task_arena_base& ta) { __TBB_ASSERT_EX(td, "Scheduler is not initialized"); __TBB_ASSERT(td->my_arena != a || td->my_arena_index == 0, "internal_wait is not supported within a worker context" ); if (a->my_max_num_workers != 0) { - while (a->num_workers_active() || a->my_pool_state.load(std::memory_order_acquire) != arena::SNAPSHOT_EMPTY) { + while (a->num_workers_active() || !a->is_empty()) { yield(); } } @@ -717,11 +838,11 @@ int task_arena_impl::max_concurrency(const d1::task_arena_base *ta) { if( a ) { // Get parameters from the arena __TBB_ASSERT( !ta || ta->my_max_concurrency==1, nullptr); - return a->my_num_reserved_slots + a->my_max_num_workers -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - + (a->my_local_concurrency_flag.test() ? 1 : 0) -#endif - ; + int mandatory_worker = 0; + if (a->is_arena_workerless() && a->my_num_reserved_slots == 1) { + mandatory_worker = a->my_mandatory_concurrency.test() ? 1 : 0; + } + return a->my_num_reserved_slots + a->my_max_num_workers + mandatory_worker; } if (ta && ta->my_max_concurrency == 1) { diff --git a/third-party/tbb/src/tbb/arena.h b/third-party/tbb/src/tbb/arena.h index 0f4165d5..1e95f117 100644 --- a/third-party/tbb/src/tbb/arena.h +++ b/third-party/tbb/src/tbb/arena.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -21,6 +21,8 @@ #include #include "oneapi/tbb/detail/_task.h" +#include "oneapi/tbb/detail/_utils.h" +#include "oneapi/tbb/spin_mutex.h" #include "scheduler_common.h" #include "intrusive_list.h" @@ -28,11 +30,11 @@ #include "arena_slot.h" #include "rml_tbb.h" #include "mailbox.h" -#include "market.h" #include "governor.h" #include "concurrent_monitor.h" #include "observer_proxy.h" -#include "oneapi/tbb/spin_mutex.h" +#include "thread_control_monitor.h" +#include "threading_control_client.h" namespace tbb { namespace detail { @@ -40,6 +42,7 @@ namespace r1 { class task_dispatcher; class task_group_context; +class threading_control; class allocate_root_with_context_proxy; #if __TBB_ARENA_BINDING @@ -133,11 +136,10 @@ struct stack_anchor_type { stack_anchor_type(const stack_anchor_type&) = delete; }; -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY class atomic_flag { static const std::uintptr_t SET = 1; - static const std::uintptr_t EMPTY = 0; - std::atomic my_state; + static const std::uintptr_t UNSET = 0; + std::atomic my_state{UNSET}; public: bool test_and_set() { std::uintptr_t state = my_state.load(std::memory_order_acquire); @@ -149,13 +151,13 @@ class atomic_flag { // We interrupted clear transaction return false; } - if (state != EMPTY) { + if (state != UNSET) { // We lost our epoch return false; } // We are too late but still in the same epoch __TBB_fallthrough; - case EMPTY: + case UNSET: return my_state.compare_exchange_strong(state, SET); } } @@ -165,21 +167,17 @@ class atomic_flag { std::uintptr_t state = my_state.load(std::memory_order_acquire); if (state == SET && my_state.compare_exchange_strong(state, busy)) { if (pred()) { - return my_state.compare_exchange_strong(busy, EMPTY); + return my_state.compare_exchange_strong(busy, UNSET); } // The result of the next operation is discarded, always false should be returned. 
my_state.compare_exchange_strong(busy, SET); } return false; } - void clear() { - my_state.store(EMPTY, std::memory_order_release); - } - bool test() { - return my_state.load(std::memory_order_acquire) != EMPTY; + bool test(std::memory_order order = std::memory_order_acquire) { + return my_state.load(order) != UNSET; } }; -#endif //! The structure of an arena, except the array of slots. /** Separated in order to simplify padding. @@ -220,60 +218,41 @@ struct arena_base : padded { //! The total number of workers that are requested from the resource manager. int my_total_num_workers_requested; - //! The number of workers that are really requested from the resource manager. - //! Possible values are in [0, my_max_num_workers] - int my_num_workers_requested; - //! The index in the array of per priority lists of arenas this object is in. /*const*/ unsigned my_priority_level; - //! The max priority level of arena in market. + //! The max priority level of arena in permit manager. std::atomic my_is_top_priority{false}; //! Current task pool state and estimate of available tasks amount. - /** The estimate is either 0 (SNAPSHOT_EMPTY) or infinity (SNAPSHOT_FULL). - Special state is "busy" (any other unsigned value). - Note that the implementation of arena::is_busy_or_empty() requires - my_pool_state to be unsigned. */ - using pool_state_t = std::uintptr_t ; - std::atomic my_pool_state; + atomic_flag my_pool_state; //! The list of local observers attached to this arena. observer_list my_observers; #if __TBB_ARENA_BINDING //! Pointer to internal observer that allows to bind threads in arena to certain NUMA node. - numa_binding_observer* my_numa_binding_observer; + numa_binding_observer* my_numa_binding_observer{nullptr}; #endif /*__TBB_ARENA_BINDING*/ // Below are rarely modified members - //! The market that owns this arena. - market* my_market; + threading_control* my_threading_control; //! Default task group context. 
d1::task_group_context* my_default_ctx; -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - // arena needs an extra worker despite a global limit - std::atomic my_global_concurrency_mode; -#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */ - //! Waiting object for external threads that cannot join the arena. concurrent_monitor my_exit_monitors; //! Coroutines (task_dispathers) cache buffer arena_co_cache my_co_cache; -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY // arena needs an extra worker despite the arena limit - atomic_flag my_local_concurrency_flag; + atomic_flag my_mandatory_concurrency; // the number of local mandatory concurrency requests - int my_local_concurrency_requests; -#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY*/ + int my_mandatory_requests; - //! ABA prevention marker. - std::uintptr_t my_aba_epoch; //! The number of slots in the arena. unsigned my_num_slots; //! The number of reserved slots (can be occupied only by external threads). @@ -281,11 +260,7 @@ struct arena_base : padded { //! The number of workers requested by the external thread owning the arena. unsigned my_max_num_workers; - //! The target serialization epoch for callers of adjust_job_count_estimate - int my_adjust_demand_target_epoch; - - //! The current serialization epoch for callers of adjust_job_count_estimate - d1::waitable_atomic my_adjust_demand_current_epoch; + threading_control_client my_tc_client; #if TBB_USE_ASSERT //! Used to trap accesses to the object after its destruction. @@ -306,17 +281,19 @@ class arena: public padded }; //! Constructor - arena ( market& m, unsigned max_num_workers, unsigned num_reserved_slots, unsigned priority_level); + arena(threading_control* control, unsigned max_num_workers, unsigned num_reserved_slots, unsigned priority_level); //! Allocate an instance of arena. 
- static arena& allocate_arena( market& m, unsigned num_slots, unsigned num_reserved_slots, - unsigned priority_level ); + static arena& allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, + unsigned priority_level); + + static arena& create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level, d1::constraints constraints = d1::constraints{}); - static int unsigned num_arena_slots ( unsigned num_slots ) { - return max(2u, num_slots); + static int unsigned num_arena_slots ( unsigned num_slots, unsigned num_reserved_slots ) { + return num_reserved_slots == 0 ? num_slots : max(2u, num_slots); } - static int allocation_size ( unsigned num_slots ) { + static int allocation_size( unsigned num_slots ) { return sizeof(base_type) + num_slots * (sizeof(mail_outbox) + sizeof(arena_slot) + sizeof(task_dispatcher)); } @@ -328,13 +305,7 @@ class arena: public padded } //! Completes arena shutdown, destructs and deallocates it. - void free_arena (); - - //! No tasks to steal since last snapshot was taken - static const pool_state_t SNAPSHOT_EMPTY = 0; - - //! At least one task has been offered for stealing since the last snapshot started - static const pool_state_t SNAPSHOT_FULL = pool_state_t(-1); + void free_arena(); //! The number of least significant bits for external references static const unsigned ref_external_bits = 12; // up to 4095 external and 1M workers @@ -343,9 +314,6 @@ class arena: public padded static const unsigned ref_external = 1; static const unsigned ref_worker = 1 << ref_external_bits; - //! No tasks to steal or snapshot is being taken. - static bool is_busy_or_empty( pool_state_t s ) { return s < SNAPSHOT_FULL; } - //! The number of workers active in the arena. 
unsigned num_workers_active() const { return my_references.load(std::memory_order_acquire) >> ref_external_bits; @@ -356,6 +324,8 @@ class arena: public padded return num_workers_active() > my_num_workers_allotted.load(std::memory_order_relaxed); } + void request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads = false); + //! If necessary, raise a flag that there is new job in arena. template void advertise_new_work(); @@ -372,8 +342,7 @@ class arena: public padded #endif //! Check if there is job anywhere in arena. - /** Return true if no job or if arena is being cleaned up. */ - bool is_out_of_work(); + void out_of_work(); //! enqueue a task into starvation-resistance queue void enqueue_task(d1::task&, d1::task_group_context&, thread_data&); @@ -382,12 +351,19 @@ class arena: public padded void process(thread_data&); //! Notification that the thread leaves its arena - template - inline void on_thread_leaving ( ); - //! Check for the presence of enqueued tasks at all priority levels + void on_thread_leaving(unsigned ref_param); + + //! Check for the presence of enqueued tasks bool has_enqueued_tasks(); + //! Check for the presence of any tasks + bool has_tasks(); + + bool is_empty() { return my_pool_state.test() == /* EMPTY */ false; } + + thread_control_monitor& get_waiting_threads_monitor(); + static const std::size_t out_of_arena = ~size_t(0); //! Tries to occupy a slot in the arena. On success, returns the slot index; if no slot is available, returns out_of_arena. 
template @@ -397,158 +373,67 @@ class arena: public padded std::uintptr_t calculate_stealing_threshold(); - /** Must be the last data field */ - arena_slot my_slots[1]; -}; // class arena + unsigned priority_level() { return my_priority_level; } -template -inline void arena::on_thread_leaving ( ) { - // - // Implementation of arena destruction synchronization logic contained various - // bugs/flaws at the different stages of its evolution, so below is a detailed - // description of the issues taken into consideration in the framework of the - // current design. - // - // In case of using fire-and-forget tasks (scheduled via task::enqueue()) - // external thread is allowed to leave its arena before all its work is executed, - // and market may temporarily revoke all workers from this arena. Since revoked - // workers never attempt to reset arena state to EMPTY and cancel its request - // to RML for threads, the arena object is destroyed only when both the last - // thread is leaving it and arena's state is EMPTY (that is its external thread - // left and it does not contain any work). - // Thus resetting arena to EMPTY state (as earlier TBB versions did) should not - // be done here (or anywhere else in the external thread to that matter); doing so - // can result either in arena's premature destruction (at least without - // additional costly checks in workers) or in unnecessary arena state changes - // (and ensuing workers migration). - // - // A worker that checks for work presence and transitions arena to the EMPTY - // state (in snapshot taking procedure arena::is_out_of_work()) updates - // arena::my_pool_state first and only then arena::my_num_workers_requested. - // So the check for work absence must be done against the latter field. - // - // In a time window between decrementing the active threads count and checking - // if there is an outstanding request for workers. 
New worker thread may arrive, - // finish remaining work, set arena state to empty, and leave decrementing its - // refcount and destroying. Then the current thread will destroy the arena - // the second time. To preclude it a local copy of the outstanding request - // value can be stored before decrementing active threads count. - // - // But this technique may cause two other problem. When the stored request is - // zero, it is possible that arena still has threads and they can generate new - // tasks and thus re-establish non-zero requests. Then all the threads can be - // revoked (as described above) leaving this thread the last one, and causing - // it to destroy non-empty arena. - // - // The other problem takes place when the stored request is non-zero. Another - // thread may complete the work, set arena state to empty, and leave without - // arena destruction before this thread decrements the refcount. This thread - // cannot destroy the arena either. Thus the arena may be "orphaned". - // - // In both cases we cannot dereference arena pointer after the refcount is - // decremented, as our arena may already be destroyed. - // - // If this is the external thread, the market is protected by refcount to it. - // In case of workers market's liveness is ensured by the RML connection - // rundown protocol, according to which the client (i.e. the market) lives - // until RML server notifies it about connection termination, and this - // notification is fired only after all workers return into RML. - // - // Thus if we decremented refcount to zero we ask the market to check arena - // state (including the fact if it is alive) under the lock. 
- // - std::uintptr_t aba_epoch = my_aba_epoch; - unsigned priority_level = my_priority_level; - market* m = my_market; - __TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter"); -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - // When there is no workers someone must free arena, as - // without workers, no one calls is_out_of_work(). - // Skip workerless arenas because they have no demand for workers. - // TODO: consider more strict conditions for the cleanup, - // because it can create the demand of workers, - // but the arena can be already empty (and so ready for destroying) - // TODO: Fix the race: while we check soft limit and it might be changed. - if( ref_param==ref_external && my_num_slots != my_num_reserved_slots - && 0 == m->my_num_workers_soft_limit.load(std::memory_order_relaxed) && - !my_global_concurrency_mode.load(std::memory_order_relaxed) ) { - is_out_of_work(); - // We expect, that in worst case it's enough to have num_priority_levels-1 - // calls to restore priorities and yet another is_out_of_work() to conform - // that no work was found. But as market::set_active_num_workers() can be called - // concurrently, can't guarantee last is_out_of_work() return true. 
- } -#endif + bool has_request() { return my_total_num_workers_requested; } + + unsigned references() const { return my_references.load(std::memory_order_acquire); } + + bool is_arena_workerless() const { return my_max_num_workers == 0; } - // Release our reference to sync with arena destroy - unsigned remaining_ref = my_references.fetch_sub(ref_param, std::memory_order_release) - ref_param; - if (remaining_ref == 0) { - m->try_destroy_arena( this, aba_epoch, priority_level ); + void set_top_priority(bool); + + bool is_top_priority() const; + + bool is_joinable() const { + return num_workers_active() < my_num_workers_allotted.load(std::memory_order_relaxed); } -} -template -void arena::advertise_new_work() { - auto is_related_arena = [&] (market_context context) { - return this == context.my_arena_addr; - }; + bool try_join(); - if( work_type == work_enqueued ) { - atomic_fence_seq_cst(); -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - if ( my_market->my_num_workers_soft_limit.load(std::memory_order_acquire) == 0 && - my_global_concurrency_mode.load(std::memory_order_acquire) == false ) - my_market->enable_mandatory_concurrency(this); + void set_allotment(unsigned allotment); - if (my_max_num_workers == 0 && my_num_reserved_slots == 1 && my_local_concurrency_flag.test_and_set()) { - my_market->adjust_demand(*this, /* delta = */ 1, /* mandatory = */ true); - } -#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */ + int update_concurrency(unsigned concurrency); + + std::pair update_request(int mandatory_delta, int workers_delta); + + /** Must be the last data field */ + arena_slot my_slots[1]; +}; // class arena + +template +void arena::advertise_new_work() { + bool is_mandatory_needed = false; + bool are_workers_needed = false; + + if (work_type != work_spawned) { // Local memory fence here and below is required to avoid missed wakeups; see the comment below. // Starvation resistant tasks require concurrency, so missed wakeups are unacceptable. 
- } - else if( work_type == wakeup ) { atomic_fence_seq_cst(); } + if (work_type == work_enqueued && my_num_slots > my_num_reserved_slots) { + is_mandatory_needed = my_mandatory_concurrency.test_and_set(); + } + // Double-check idiom that, in case of spawning, is deliberately sloppy about memory fences. // Technically, to avoid missed wakeups, there should be a full memory fence between the point we // released the task pool (i.e. spawned task) and read the arena's state. However, adding such a // fence might hurt overall performance more than it helps, because the fence would be executed // on every task pool release, even when stealing does not occur. Since TBB allows parallelism, // but never promises parallelism, the missed wakeup is not a correctness problem. - pool_state_t snapshot = my_pool_state.load(std::memory_order_acquire); - if( is_busy_or_empty(snapshot) ) { - // Attempt to mark as full. The compare_and_swap below is a little unusual because the - // result is compared to a value that can be different than the comparand argument. - pool_state_t expected_state = snapshot; - my_pool_state.compare_exchange_strong( expected_state, SNAPSHOT_FULL ); - if( expected_state == SNAPSHOT_EMPTY ) { - if( snapshot != SNAPSHOT_EMPTY ) { - // This thread read "busy" into snapshot, and then another thread transitioned - // my_pool_state to "empty" in the meantime, which caused the compare_and_swap above - // to fail. Attempt to transition my_pool_state from "empty" to "full". - expected_state = SNAPSHOT_EMPTY; - if( !my_pool_state.compare_exchange_strong( expected_state, SNAPSHOT_FULL ) ) { - // Some other thread transitioned my_pool_state from "empty", and hence became - // responsible for waking up workers. - return; - } - } - // This thread transitioned pool from empty to full state, and thus is responsible for - // telling the market that there is work to do. 
-#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - if( work_type == work_spawned ) { - if ( my_global_concurrency_mode.load(std::memory_order_acquire) == true ) - my_market->mandatory_concurrency_disable( this ); - } -#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */ - // TODO: investigate adjusting of arena's demand by a single worker. - my_market->adjust_demand(*this, my_max_num_workers, /* mandatory = */ false); + are_workers_needed = my_pool_state.test_and_set(); - // Notify all sleeping threads that work has appeared in the arena. - my_market->get_wait_list().notify(is_related_arena); + if (is_mandatory_needed || are_workers_needed) { + int mandatory_delta = is_mandatory_needed ? 1 : 0; + int workers_delta = are_workers_needed ? my_max_num_workers : 0; + + if (is_mandatory_needed && is_arena_workerless()) { + // Set workers_delta to 1 to keep arena invariants consistent + workers_delta = 1; } + + request_workers(mandatory_delta, workers_delta, /* wakeup_threads = */ true); } } diff --git a/third-party/tbb/src/tbb/arena_slot.h b/third-party/tbb/src/tbb/arena_slot.h index cdd91902..c526e474 100644 --- a/third-party/tbb/src/tbb/arena_slot.h +++ b/third-party/tbb/src/tbb/arena_slot.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -169,6 +169,11 @@ class arena_slot : private arena_slot_shared_state, private arena_slot_private_s return task_pool.load(std::memory_order_relaxed) != EmptyTaskPool; } + bool is_empty() const { + return task_pool.load(std::memory_order_relaxed) == EmptyTaskPool || + head.load(std::memory_order_relaxed) >= tail.load(std::memory_order_relaxed); + } + bool is_occupied() const { return my_is_occupied.load(std::memory_order_relaxed); } diff --git a/third-party/tbb/src/tbb/cancellation_disseminator.h b/third-party/tbb/src/tbb/cancellation_disseminator.h new file mode 100644 index 00000000..72445829 --- /dev/null +++ b/third-party/tbb/src/tbb/cancellation_disseminator.h @@ -0,0 +1,85 @@ +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_cancellation_disseminator_H +#define _TBB_cancellation_disseminator_H + +#include "oneapi/tbb/mutex.h" +#include "oneapi/tbb/task_group.h" + +#include "intrusive_list.h" +#include "thread_data.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class cancellation_disseminator { +public: + //! Finds all contexts affected by the state change and propagates the new state to them. + /* The propagation is relayed to the cancellation_disseminator because tasks created by one + external thread can be passed to and executed by other external threads. 
This means + that context trees can span several arenas at once and thus state change + propagation cannot be generally localized to one arena only. + */ + bool propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, d1::task_group_context& src, uint32_t new_state) { + if (src.my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) { + return true; + } + + // The whole propagation algorithm is under the lock in order to ensure correctness + // in case of concurrent state changes at the different levels of the context tree. + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + // TODO: consider to use double-check idiom + if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state) { + // Another thread has concurrently changed the state. Back down. + return false; + } + + // Advance global state propagation epoch + ++the_context_state_propagation_epoch; + // Propagate to all workers and external threads and sync up their local epochs with the global one + // The whole propagation sequence is locked, thus no contention is expected + for (auto& thr_data : my_threads_list) { + thr_data.propagate_task_group_state(mptr_state, src, new_state); + } + + return true; + } + + void register_thread(thread_data& td) { + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + my_threads_list.push_front(td); + } + + void unregister_thread(thread_data& td) { + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + my_threads_list.remove(td); + } + +private: + using thread_data_list_type = intrusive_list; + using threads_list_mutex_type = d1::mutex; + + threads_list_mutex_type my_threads_list_mutex; + thread_data_list_type my_threads_list; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_cancellation_disseminator_H diff --git a/third-party/tbb/src/tbb/concurrent_monitor.h b/third-party/tbb/src/tbb/concurrent_monitor.h index 
3d20ef5b..3e5c4beb 100644 --- a/third-party/tbb/src/tbb/concurrent_monitor.h +++ b/third-party/tbb/src/tbb/concurrent_monitor.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -290,7 +290,17 @@ class concurrent_monitor_base { n = my_waitset.front(); if (n != end) { my_waitset.remove(*n); + +// GCC 12.x-13.x issues a warning here that to_wait_node(n)->my_is_in_list might have size 0, since n is +// a base_node pointer. (This cannot happen, because only wait_node pointers are added to my_waitset.) +#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); +#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER +#pragma GCC diagnostic pop +#endif } } diff --git a/third-party/tbb/src/tbb/dynamic_link.cpp b/third-party/tbb/src/tbb/dynamic_link.cpp index 2d88f8bc..a21beb5a 100644 --- a/third-party/tbb/src/tbb/dynamic_link.cpp +++ b/third-party/tbb/src/tbb/dynamic_link.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,7 +34,8 @@ // Unify system calls #define dlopen( name, flags ) LoadLibrary( name ) #define dlsym( handle, name ) GetProcAddress( handle, name ) - #define dlclose( handle ) ( ! FreeLibrary( handle ) ) + // FreeLibrary return bool value that is not used. + #define dlclose( handle ) (void)( ! 
FreeLibrary( handle ) ) #define dlerror() GetLastError() #ifndef PATH_MAX #define PATH_MAX MAX_PATH diff --git a/third-party/tbb/src/tbb/global_control.cpp b/third-party/tbb/src/tbb/global_control.cpp index 1bc3c22c..127fc92d 100644 --- a/third-party/tbb/src/tbb/global_control.cpp +++ b/third-party/tbb/src/tbb/global_control.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,11 +17,13 @@ #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_template_helpers.h" +#include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/global_control.h" #include "oneapi/tbb/tbb_allocator.h" #include "oneapi/tbb/spin_mutex.h" #include "governor.h" +#include "threading_control.h" #include "market.h" #include "misc.h" @@ -34,17 +36,21 @@ namespace r1 { //! Comparator for a set of global_control objects struct control_storage_comparator { - bool operator()(const global_control* lhs, const global_control* rhs) const; + bool operator()(const d1::global_control* lhs, const d1::global_control* rhs) const; }; class control_storage { friend struct global_control_impl; friend std::size_t global_control_active_value(int); + friend void global_control_lock(); + friend void global_control_unlock(); + friend std::size_t global_control_active_value_unsafe(d1::global_control::parameter); protected: std::size_t my_active_value{0}; - std::set> my_list{}; + std::set> my_list{}; spin_mutex my_list_mutex{}; public: + virtual ~control_storage() = default; virtual std::size_t default_value() const = 0; virtual void apply_active(std::size_t new_active) { my_active_value = new_active; @@ -56,6 +62,10 @@ class control_storage { spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call return !my_list.empty() ? 
my_active_value : default_value(); } + + std::size_t active_value_unsafe() { + return !my_list.empty() ? my_active_value : default_value(); + } }; class alignas(max_nfs_size) allowed_parallelism_control : public control_storage { @@ -67,23 +77,21 @@ class alignas(max_nfs_size) allowed_parallelism_control : public control_storage } void apply_active(std::size_t new_active) override { control_storage::apply_active(new_active); - __TBB_ASSERT( my_active_value>=1, nullptr); + __TBB_ASSERT(my_active_value >= 1, nullptr); // -1 to take external thread into account - market::set_active_num_workers( my_active_value-1 ); + threading_control::set_active_num_workers(my_active_value - 1); } std::size_t active_value() override { spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call - if (my_list.empty()) + if (my_list.empty()) { return default_value(); + } + // non-zero, if market is active - const std::size_t workers = market::max_num_workers(); + const std::size_t workers = threading_control::max_num_workers(); // We can't exceed market's maximal number of workers. // +1 to take external thread into account - return workers? min(workers+1, my_active_value): my_active_value; - } -public: - std::size_t active_value_if_present() const { - return !my_list.empty() ? my_active_value : 0; + return workers ? 
min(workers + 1, my_active_value) : my_active_value; } }; @@ -124,50 +132,57 @@ class alignas(max_nfs_size) lifetime_control : public control_storage { void apply_active(std::size_t new_active) override { if (new_active == 1) { // reserve the market reference - market::global_market_mutex_type::scoped_lock lock( market::theMarketMutex ); - if (market::theMarket) { - market::add_ref_unsafe(lock, /*is_public*/ true); - } + threading_control::register_lifetime_control(); } else if (new_active == 0) { // new_active == 0 - // release the market reference - market::global_market_mutex_type::scoped_lock lock( market::theMarketMutex ); - if (market::theMarket != nullptr) { - lock.release(); - market::theMarket->release(/*is_public*/ true, /*blocking_terminate*/ false); - } + threading_control::unregister_lifetime_control(/*blocking_terminate*/ false); } control_storage::apply_active(new_active); } +}; -public: - bool is_empty() { - spin_mutex::scoped_lock lock(my_list_mutex); - return my_list.empty(); +static control_storage* controls[] = {nullptr, nullptr, nullptr, nullptr}; + +void global_control_acquire() { + controls[0] = new (cache_aligned_allocate(sizeof(allowed_parallelism_control))) allowed_parallelism_control{}; + controls[1] = new (cache_aligned_allocate(sizeof(stack_size_control))) stack_size_control{}; + controls[2] = new (cache_aligned_allocate(sizeof(terminate_on_exception_control))) terminate_on_exception_control{}; + controls[3] = new (cache_aligned_allocate(sizeof(lifetime_control))) lifetime_control{}; +} + +void global_control_release() { + for (auto& ptr : controls) { + ptr->~control_storage(); + cache_aligned_deallocate(ptr); + ptr = nullptr; } -}; +} -static allowed_parallelism_control allowed_parallelism_ctl; -static stack_size_control stack_size_ctl; -static terminate_on_exception_control terminate_on_exception_ctl; -static lifetime_control lifetime_ctl; -static control_storage *controls[] = {&allowed_parallelism_ctl, &stack_size_ctl, 
&terminate_on_exception_ctl, &lifetime_ctl}; +void global_control_lock() { + for (auto& ctl : controls) { + ctl->my_list_mutex.lock(); + } +} -//! Comparator for a set of global_control objects -inline bool control_storage_comparator::operator()(const global_control* lhs, const global_control* rhs) const { - __TBB_ASSERT_RELEASE(lhs->my_param < global_control::parameter_max , nullptr); - return lhs->my_value < rhs->my_value || (lhs->my_value == rhs->my_value && lhs < rhs); +void global_control_unlock() { + int N = std::distance(std::begin(controls), std::end(controls)); + for (int i = N - 1; i >= 0; --i) { + controls[i]->my_list_mutex.unlock(); + } } -unsigned market::app_parallelism_limit() { - return allowed_parallelism_ctl.active_value_if_present(); +std::size_t global_control_active_value_unsafe(d1::global_control::parameter param) { + __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); + return controls[param]->active_value_unsafe(); } -bool terminate_on_exception() { - return global_control::active_value(global_control::terminate_on_exception) == 1; +//! 
Comparator for a set of global_control objects +inline bool control_storage_comparator::operator()(const d1::global_control* lhs, const d1::global_control* rhs) const { + __TBB_ASSERT_RELEASE(lhs->my_param < d1::global_control::parameter_max , nullptr); + return lhs->my_value < rhs->my_value || (lhs->my_value == rhs->my_value && lhs < rhs); } -unsigned market::is_lifetime_control_present() { - return !lifetime_ctl.is_empty(); +bool terminate_on_exception() { + return d1::global_control::active_value(d1::global_control::terminate_on_exception) == 1; } struct global_control_impl { @@ -184,7 +199,7 @@ struct global_control_impl { public: static void create(d1::global_control& gc) { - __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, nullptr); + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; spin_mutex::scoped_lock lock(c->my_list_mutex); @@ -197,15 +212,15 @@ struct global_control_impl { } static void destroy(d1::global_control& gc) { - __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, nullptr); + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; // Concurrent reading and changing global parameter is possible. 
spin_mutex::scoped_lock lock(c->my_list_mutex); - __TBB_ASSERT(gc.my_param == global_control::scheduler_handle || !c->my_list.empty(), nullptr); + __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle || !c->my_list.empty(), nullptr); std::size_t new_active = (std::size_t)(-1), old_active = c->my_active_value; if (!erase_if_present(c, gc)) { - __TBB_ASSERT(gc.my_param == global_control::scheduler_handle , nullptr); + __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle , nullptr); return; } if (c->my_list.empty()) { @@ -220,7 +235,7 @@ struct global_control_impl { } static bool remove_and_check_if_empty(d1::global_control& gc) { - __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, nullptr); + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; spin_mutex::scoped_lock lock(c->my_list_mutex); @@ -230,7 +245,7 @@ struct global_control_impl { } #if TBB_USE_ASSERT static bool is_present(d1::global_control& gc) { - __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, nullptr); + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; spin_mutex::scoped_lock lock(c->my_list_mutex); @@ -259,7 +274,7 @@ bool is_present(d1::global_control& gc) { } #endif // TBB_USE_ASSERT std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int param) { - __TBB_ASSERT_RELEASE(param < global_control::parameter_max, nullptr); + __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); return controls[param]->active_value(); } diff --git a/third-party/tbb/src/tbb/governor.cpp b/third-party/tbb/src/tbb/governor.cpp index 3111ab3e..55175196 100644 --- a/third-party/tbb/src/tbb/governor.cpp +++ b/third-party/tbb/src/tbb/governor.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, 
Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,12 +15,14 @@ */ #include "governor.h" +#include "threading_control.h" #include "main.h" #include "thread_data.h" #include "market.h" #include "arena.h" #include "dynamic_link.h" #include "concurrent_monitor.h" +#include "thread_dispatcher.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/global_control.h" @@ -40,6 +42,8 @@ namespace detail { namespace r1 { void clear_address_waiter_table(); +void global_control_acquire(); +void global_control_release(); //! global_control.cpp contains definition bool remove_and_check_if_empty(d1::global_control& gc); @@ -58,6 +62,7 @@ namespace system_topology { //------------------------------------------------------------------------ void governor::acquire_resources () { + global_control_acquire(); #if __TBB_USE_POSIX int status = theTLS.create(auto_terminate); #else @@ -83,6 +88,7 @@ void governor::release_resources () { system_topology::destroy(); dynamic_unlink_all(); + global_control_release(); } rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) { @@ -108,6 +114,10 @@ void governor::one_time_init() { } } +bool governor::does_client_join_workers(const rml::tbb_client &client) { + return ((const thread_dispatcher&)client).must_join_workers(); +} + /* There is no portable way to get stack base address in Posix, however the modern Linux versions provide pthread_attr_np API that can be used to obtain thread's @@ -185,21 +195,20 @@ void governor::init_external_thread() { int num_reserved_slots = 1; unsigned arena_priority_level = 1; // corresponds to tbb::task_arena::priority::normal std::size_t stack_size = 0; - arena& a = *market::create_arena(num_slots, num_reserved_slots, arena_priority_level, stack_size); - // We need an internal reference to the market. TODO: is it legacy? 
- market::global_market(false); + threading_control* thr_control = threading_control::register_public_reference(); + arena& a = arena::create(thr_control, num_slots, num_reserved_slots, arena_priority_level); // External thread always occupies the first slot thread_data& td = *new(cache_aligned_allocate(sizeof(thread_data))) thread_data(0, false); td.attach_arena(a, /*slot index*/ 0); __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); - stack_size = a.my_market->worker_stack_size(); + stack_size = a.my_threading_control->worker_stack_size(); std::uintptr_t stack_base = get_stack_base(stack_size); task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); td.enter_task_dispatcher(task_disp, calculate_stealing_threshold(stack_base, stack_size)); td.my_arena_slot->occupy(); - a.my_market->add_external_thread(td); + thr_control->register_thread(td); set_thread_data(td); #if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED // The external thread destructor is called from dllMain but it is not available with a static build. @@ -223,7 +232,7 @@ void governor::auto_terminate(void* tls) { // Only external thread can be inside an arena during termination. 
if (td->my_arena_slot) { arena* a = td->my_arena; - market* m = a->my_market; + threading_control* thr_control = a->my_threading_control; // If the TLS slot is already cleared by OS or underlying concurrency // runtime, restore its value to properly clean up arena @@ -236,16 +245,16 @@ void governor::auto_terminate(void* tls) { td->leave_task_dispatcher(); td->my_arena_slot->release(); // Release an arena - a->on_thread_leaving(); + a->on_thread_leaving(arena::ref_external); - m->remove_external_thread(*td); + thr_control->unregister_thread(*td); // The tls should be cleared before market::release because // market can destroy the tls key if we keep the last reference clear_tls(); // If there was an associated arena, it added a public market reference - m->release( /*is_public*/ true, /*blocking_terminate*/ false); + thr_control->unregister_public_reference(/* blocking terminate =*/ false); } else { clear_tls(); } @@ -272,12 +281,10 @@ void release_impl(d1::task_scheduler_handle& handle) { bool finalize_impl(d1::task_scheduler_handle& handle) { __TBB_ASSERT_RELEASE(handle, "trying to finalize with null handle"); - market::global_market_mutex_type::scoped_lock lock( market::theMarketMutex ); - bool ok = true; // ok if theMarket does not exist yet - market* m = market::theMarket; // read the state of theMarket - if (m != nullptr) { - lock.release(); - __TBB_ASSERT(is_present(*handle.m_ctl), "finalize or release was already called on this object"); + __TBB_ASSERT(is_present(*handle.m_ctl), "finalize or release was already called on this object"); + + bool ok = true; // ok if threading_control does not exist yet + if (threading_control::is_present()) { thread_data* td = governor::get_thread_data_if_initialized(); if (td) { task_dispatcher* task_disp = td->my_task_dispatcher; @@ -286,12 +293,14 @@ bool finalize_impl(d1::task_scheduler_handle& handle) { governor::auto_terminate(td); } } + if (remove_and_check_if_empty(*handle.m_ctl)) { - ok = m->release(/*is_public*/ 
true, /*blocking_terminate*/ true); + ok = threading_control::unregister_lifetime_control(/*blocking_terminate*/ true); } else { ok = false; } } + return ok; } @@ -367,15 +376,18 @@ static void (*restore_affinity_ptr)( binding_handler* handler_ptr, int slot_num int (*get_default_concurrency_ptr)( int numa_id, int core_type_id, int max_threads_per_core ) = dummy_get_default_concurrency; -#if _WIN32 || _WIN64 || __unix__ +#if _WIN32 || _WIN64 || __unix__ || __APPLE__ + // Table describing how to link the handlers. static const dynamic_link_descriptor TbbBindLinkTable[] = { DLD(__TBB_internal_initialize_system_topology, initialize_system_topology_ptr), DLD(__TBB_internal_destroy_system_topology, destroy_system_topology_ptr), +#if __TBB_CPUBIND_PRESENT DLD(__TBB_internal_allocate_binding_handler, allocate_binding_handler_ptr), DLD(__TBB_internal_deallocate_binding_handler, deallocate_binding_handler_ptr), DLD(__TBB_internal_apply_affinity, apply_affinity_ptr), DLD(__TBB_internal_restore_affinity, restore_affinity_ptr), +#endif DLD(__TBB_internal_get_default_concurrency, get_default_concurrency_ptr) }; @@ -390,6 +402,9 @@ static const unsigned LinkTableSize = sizeof(TbbBindLinkTable) / sizeof(dynamic_ #if _WIN32 || _WIN64 #define LIBRARY_EXTENSION ".dll" #define LIBRARY_PREFIX +#elif __APPLE__ +#define LIBRARY_EXTENSION __TBB_STRING(.3.dylib) +#define LIBRARY_PREFIX "lib" #elif __unix__ #define LIBRARY_EXTENSION __TBB_STRING(.so.3) #define LIBRARY_PREFIX "lib" @@ -418,7 +433,7 @@ int core_types_count = 0; int* core_types_indexes = nullptr; const char* load_tbbbind_shared_object() { -#if _WIN32 || _WIN64 || __unix__ +#if _WIN32 || _WIN64 || __unix__ || __APPLE__ #if _WIN32 && !_WIN64 // For 32-bit Windows applications, process affinity masks can only support up to 32 logical CPUs. 
SYSTEM_INFO si; @@ -430,7 +445,7 @@ const char* load_tbbbind_shared_object() { return tbbbind_version; } } -#endif /* _WIN32 || _WIN64 || __unix__ */ +#endif /* _WIN32 || _WIN64 || __unix__ || __APPLE__ */ return nullptr; } diff --git a/third-party/tbb/src/tbb/governor.h b/third-party/tbb/src/tbb/governor.h index 3d861e53..573443d7 100644 --- a/third-party/tbb/src/tbb/governor.h +++ b/third-party/tbb/src/tbb/governor.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -47,7 +47,8 @@ typedef std::size_t stack_size_type; class governor { private: friend class __TBB_InitOnce; - friend class market; + friend class thread_dispatcher; + friend class threading_control_impl; // TODO: consider using thread_local (measure performance and side effects) //! TLS for scheduler instances associated with individual threads @@ -137,6 +138,8 @@ class governor { static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; } #endif + static bool hybrid_cpu() { return cpu_features.hybrid; } + static bool rethrow_exception_broken() { return is_rethrow_broken; } static bool is_itt_present() { diff --git a/third-party/tbb/src/tbb/main.cpp b/third-party/tbb/src/tbb/main.cpp index 8a1dc893..85e759e2 100644 --- a/third-party/tbb/src/tbb/main.cpp +++ b/third-party/tbb/src/tbb/main.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,8 +18,10 @@ #include "main.h" #include "governor.h" +#include "threading_control.h" #include "environment.h" #include "market.h" +#include "tcm_adaptor.h" #include "misc.h" #include "itt_notify.h" @@ -40,9 +42,9 @@ bool governor::UsePrivateRML; bool governor::is_rethrow_broken; //------------------------------------------------------------------------ -// market data -market* market::theMarket; -market::global_market_mutex_type market::theMarketMutex; +// threading_control data +threading_control* threading_control::g_threading_control; +threading_control::global_mutex_type threading_control::g_threading_control_mutex; //------------------------------------------------------------------------ // context propagation data @@ -90,8 +92,10 @@ static check_observer_proxy_count the_check_observer_proxy_count; //------------------------------------------------------------------------ void __TBB_InitOnce::add_ref() { - if( ++count==1 ) + if (++count == 1) { governor::acquire_resources(); + tcm_adaptor::initialize(); + } } void __TBB_InitOnce::remove_ref() { @@ -117,8 +121,10 @@ void DoOneTimeInitialization() { // No fence required for load of InitializationDone, because we are inside a critical section. if( !__TBB_InitOnce::InitializationDone ) { __TBB_InitOnce::add_ref(); - if( GetBoolEnvironmentVariable("TBB_VERSION") ) + if( GetBoolEnvironmentVariable("TBB_VERSION") ) { PrintVersion(); + tcm_adaptor::print_version(); + } bool itt_present = false; #if __TBB_USE_ITT_NOTIFY ITT_DoUnsafeOneTimeInitialization(); diff --git a/third-party/tbb/src/tbb/market.cpp b/third-party/tbb/src/tbb/market.cpp index b6504e0f..ae3fadd4 100644 --- a/third-party/tbb/src/tbb/market.cpp +++ b/third-party/tbb/src/tbb/market.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -14,380 +14,65 @@ limitations under the License. */ -#include "oneapi/tbb/global_control.h" // global_control::active_value - -#include "market.h" -#include "main.h" -#include "governor.h" #include "arena.h" -#include "thread_data.h" -#include "itt_notify.h" +#include "market.h" -#include // std::memset() +#include // std::find namespace tbb { namespace detail { namespace r1 { -/** This method must be invoked under my_arenas_list_mutex. **/ -arena* market::select_next_arena( arena* hint ) { - unsigned next_arena_priority_level = num_priority_levels; - if ( hint ) - next_arena_priority_level = hint->my_priority_level; - for ( unsigned idx = 0; idx < next_arena_priority_level; ++idx ) { - if ( !my_arenas[idx].empty() ) - return &*my_arenas[idx].begin(); - } - // don't change if arena with higher priority is not found. - return hint; -} -void market::insert_arena_into_list ( arena& a ) { - __TBB_ASSERT( a.my_priority_level < num_priority_levels, nullptr ); - my_arenas[a.my_priority_level].push_front( a ); - __TBB_ASSERT( !my_next_arena || my_next_arena->my_priority_level < num_priority_levels, nullptr ); - my_next_arena = select_next_arena( my_next_arena ); -} +class tbb_permit_manager_client : public pm_client { +public: + tbb_permit_manager_client(arena& a) : pm_client(a) {} -void market::remove_arena_from_list ( arena& a ) { - __TBB_ASSERT( a.my_priority_level < num_priority_levels, nullptr ); - my_arenas[a.my_priority_level].remove( a ); - if ( my_next_arena == &a ) - my_next_arena = nullptr; - my_next_arena = select_next_arena( my_next_arena ); -} + void register_thread() override {} + + void unregister_thread() override {} + + void set_allotment(unsigned allotment) { + my_arena.set_allotment(allotment); + } +}; //------------------------------------------------------------------------ // market //------------------------------------------------------------------------ -market::market ( unsigned workers_soft_limit, unsigned workers_hard_limit, std::size_t 
stack_size ) - : my_num_workers_hard_limit(workers_hard_limit) - , my_num_workers_soft_limit(workers_soft_limit) - , my_next_arena(nullptr) - , my_ref_count(1) - , my_stack_size(stack_size) - , my_workers_soft_limit_to_report(workers_soft_limit) -{ - // Once created RML server will start initializing workers that will need - // global market instance to get worker stack size - my_server = governor::create_rml_server( *this ); - __TBB_ASSERT( my_server, "Failed to create RML server" ); -} +market::market(unsigned workers_soft_limit) + : my_num_workers_soft_limit(workers_soft_limit) +{} -market::~market() { - poison_pointer(my_server); - poison_pointer(my_next_arena); +pm_client* market::create_client(arena& a) { + return new (cache_aligned_allocate(sizeof(tbb_permit_manager_client))) tbb_permit_manager_client(a); } -static unsigned calc_workers_soft_limit(unsigned workers_soft_limit, unsigned workers_hard_limit) { - if( int soft_limit = market::app_parallelism_limit() ) - workers_soft_limit = soft_limit-1; - else // if user set no limits (yet), use market's parameter - workers_soft_limit = max( governor::default_num_threads() - 1, workers_soft_limit ); - if( workers_soft_limit >= workers_hard_limit ) - workers_soft_limit = workers_hard_limit-1; - return workers_soft_limit; +void market::register_client(pm_client* c, d1::constraints&) { + mutex_type::scoped_lock lock(my_mutex); + my_clients[c->priority_level()].push_back(c); } -bool market::add_ref_unsafe( global_market_mutex_type::scoped_lock& lock, bool is_public, unsigned workers_requested, std::size_t stack_size ) { - market *m = theMarket; - if( m ) { - ++m->my_ref_count; - const unsigned old_public_count = is_public ? 
m->my_public_ref_count++ : /*any non-zero value*/1; - lock.release(); - if( old_public_count==0 ) - set_active_num_workers( calc_workers_soft_limit(workers_requested, m->my_num_workers_hard_limit) ); - - // do not warn if default number of workers is requested - if( workers_requested != governor::default_num_threads()-1 ) { - __TBB_ASSERT( skip_soft_limit_warning > workers_requested, - "skip_soft_limit_warning must be larger than any valid workers_requested" ); - unsigned soft_limit_to_report = m->my_workers_soft_limit_to_report.load(std::memory_order_relaxed); - if( soft_limit_to_report < workers_requested ) { - runtime_warning( "The number of workers is currently limited to %u. " - "The request for %u workers is ignored. Further requests for more workers " - "will be silently ignored until the limit changes.\n", - soft_limit_to_report, workers_requested ); - // The race is possible when multiple threads report warnings. - // We are OK with that, as there are just multiple warnings. - unsigned expected_limit = soft_limit_to_report; - m->my_workers_soft_limit_to_report.compare_exchange_strong(expected_limit, skip_soft_limit_warning); - } - - } - if( m->my_stack_size < stack_size ) - runtime_warning( "Thread stack size has been already set to %u. " - "The request for larger stack (%u) cannot be satisfied.\n", m->my_stack_size, stack_size ); - return true; - } - return false; -} - -market& market::global_market(bool is_public, unsigned workers_requested, std::size_t stack_size) { - global_market_mutex_type::scoped_lock lock( theMarketMutex ); - if( !market::add_ref_unsafe(lock, is_public, workers_requested, stack_size) ) { - // TODO: A lot is done under theMarketMutex locked. Can anything be moved out? - if( stack_size == 0 ) - stack_size = global_control::active_value(global_control::thread_stack_size); - // Expecting that 4P is suitable for most applications. - // Limit to 2P for large thread number. 
- // TODO: ask RML for max concurrency and possibly correct hard_limit - const unsigned factor = governor::default_num_threads()<=128? 4 : 2; - // The requested number of threads is intentionally not considered in - // computation of the hard limit, in order to separate responsibilities - // and avoid complicated interactions between global_control and task_scheduler_init. - // The market guarantees that at least 256 threads might be created. - const unsigned workers_hard_limit = max(max(factor*governor::default_num_threads(), 256u), app_parallelism_limit()); - const unsigned workers_soft_limit = calc_workers_soft_limit(workers_requested, workers_hard_limit); - // Create the global market instance - std::size_t size = sizeof(market); - __TBB_ASSERT( __TBB_offsetof(market, my_workers) + sizeof(std::atomic) == sizeof(market), - "my_workers must be the last data field of the market class"); - size += sizeof(std::atomic) * (workers_hard_limit - 1); - __TBB_InitOnce::add_ref(); - void* storage = cache_aligned_allocate(size); - std::memset( storage, 0, size ); - // Initialize and publish global market - market* m = new (storage) market( workers_soft_limit, workers_hard_limit, stack_size ); - if( is_public ) - m->my_public_ref_count.store(1, std::memory_order_relaxed); - if (market::is_lifetime_control_present()) { - ++m->my_public_ref_count; - ++m->my_ref_count; - } - theMarket = m; - // This check relies on the fact that for shared RML default_concurrency==max_concurrency - if ( !governor::UsePrivateRML && m->my_server->default_concurrency() < workers_soft_limit ) - runtime_warning( "RML might limit the number of workers to %u while %u is requested.\n" - , m->my_server->default_concurrency(), workers_soft_limit ); - } - return *theMarket; -} - -void market::destroy () { - this->market::~market(); // qualified to suppress warning - cache_aligned_deallocate( this ); - __TBB_InitOnce::remove_ref(); -} - -bool market::release ( bool is_public, bool blocking_terminate ) { - 
market::enforce([this] { return theMarket == this; }, "Global market instance was destroyed prematurely?"); - bool do_release = false; +void market::unregister_and_destroy_client(pm_client& c) { { - global_market_mutex_type::scoped_lock lock( theMarketMutex ); - if ( blocking_terminate ) { - __TBB_ASSERT( is_public, "Only an object with a public reference can request the blocking terminate" ); - while ( my_public_ref_count.load(std::memory_order_relaxed) == 1 && - my_ref_count.load(std::memory_order_relaxed) > 1 ) { - lock.release(); - // To guarantee that request_close_connection() is called by the last external thread, we need to wait till all - // references are released. Re-read my_public_ref_count to limit waiting if new external threads are created. - // Theoretically, new private references to the market can be added during waiting making it potentially - // endless. - // TODO: revise why the weak scheduler needs market's pointer and try to remove this wait. - // Note that the market should know about its schedulers for cancellation/exception/priority propagation, - // see e.g. task_group_context::cancel_group_execution() - while ( my_public_ref_count.load(std::memory_order_acquire) == 1 && - my_ref_count.load(std::memory_order_acquire) > 1 ) { - yield(); - } - lock.acquire( theMarketMutex ); - } - } - if ( is_public ) { - __TBB_ASSERT( theMarket == this, "Global market instance was destroyed prematurely?" ); - __TBB_ASSERT( my_public_ref_count.load(std::memory_order_relaxed), nullptr); - --my_public_ref_count; - } - if ( --my_ref_count == 0 ) { - __TBB_ASSERT( !my_public_ref_count.load(std::memory_order_relaxed), nullptr); - do_release = true; - theMarket = nullptr; - } - } - if( do_release ) { - __TBB_ASSERT( !my_public_ref_count.load(std::memory_order_relaxed), - "No public references remain if we remove the market." 
); - // inform RML that blocking termination is required - my_join_workers = blocking_terminate; - my_server->request_close_connection(); - return blocking_terminate; + mutex_type::scoped_lock lock(my_mutex); + auto& clients = my_clients[c.priority_level()]; + auto it = std::find(clients.begin(), clients.end(), &c); + __TBB_ASSERT(it != clients.end(), "Destroying of an unregistered client"); + clients.erase(it); } - return false; -} - -int market::update_workers_request() { - int old_request = my_num_workers_requested; - my_num_workers_requested = min(my_total_demand.load(std::memory_order_relaxed), - (int)my_num_workers_soft_limit.load(std::memory_order_relaxed)); -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - if (my_mandatory_num_requested > 0) { - __TBB_ASSERT(my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0, nullptr); - my_num_workers_requested = 1; - } -#endif - update_allotment(my_num_workers_requested); - return my_num_workers_requested - old_request; -} - -void market::set_active_num_workers ( unsigned soft_limit ) { - market *m; - - { - global_market_mutex_type::scoped_lock lock( theMarketMutex ); - if ( !theMarket ) - return; // actual value will be used at market creation - m = theMarket; - if (m->my_num_workers_soft_limit.load(std::memory_order_relaxed) == soft_limit) - return; - ++m->my_ref_count; - } - // have my_ref_count for market, use it safely - - int delta = 0; - { - arenas_list_mutex_type::scoped_lock lock( m->my_arenas_list_mutex ); - __TBB_ASSERT(soft_limit <= m->my_num_workers_hard_limit, nullptr); - -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - arena_list_type* arenas = m->my_arenas; - - if (m->my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0 && - m->my_mandatory_num_requested > 0) - { - for (unsigned level = 0; level < num_priority_levels; ++level ) - for (arena_list_type::iterator it = arenas[level].begin(); it != arenas[level].end(); ++it) - if (it->my_global_concurrency_mode.load(std::memory_order_relaxed)) - 
m->disable_mandatory_concurrency_impl(&*it); - } - __TBB_ASSERT(m->my_mandatory_num_requested == 0, nullptr); -#endif - - m->my_num_workers_soft_limit.store(soft_limit, std::memory_order_release); - // report only once after new soft limit value is set - m->my_workers_soft_limit_to_report.store(soft_limit, std::memory_order_relaxed); -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - if (m->my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0) { - for (unsigned level = 0; level < num_priority_levels; ++level ) - for (arena_list_type::iterator it = arenas[level].begin(); it != arenas[level].end(); ++it) - if (it->has_enqueued_tasks()) - m->enable_mandatory_concurrency_impl(&*it); - } -#endif - - delta = m->update_workers_request(); - } - // adjust_job_count_estimate must be called outside of any locks - if( delta!=0 ) - m->my_server->adjust_job_count_estimate( delta ); - // release internal market reference to match ++m->my_ref_count above - m->release( /*is_public=*/false, /*blocking_terminate=*/false ); -} - -bool governor::does_client_join_workers (const rml::tbb_client &client) { - return ((const market&)client).must_join_workers(); -} - -arena* market::create_arena ( int num_slots, int num_reserved_slots, unsigned arena_priority_level, - std::size_t stack_size ) -{ - __TBB_ASSERT( num_slots > 0, nullptr); - __TBB_ASSERT( num_reserved_slots <= num_slots, nullptr); - // Add public market reference for an external thread/task_arena (that adds an internal reference in exchange). - market &m = global_market( /*is_public=*/true, num_slots-num_reserved_slots, stack_size ); - arena& a = arena::allocate_arena( m, num_slots, num_reserved_slots, arena_priority_level ); - // Add newly created arena into the existing market's list. 
- arenas_list_mutex_type::scoped_lock lock(m.my_arenas_list_mutex); - m.insert_arena_into_list(a); - return &a; + auto client = static_cast(&c); + client->~tbb_permit_manager_client(); + cache_aligned_deallocate(client); } -/** This method must be invoked under my_arenas_list_mutex. **/ -void market::detach_arena ( arena& a ) { - market::enforce([this] { return theMarket == this; }, "Global market instance was destroyed prematurely?"); - __TBB_ASSERT( !a.my_slots[0].is_occupied(), nullptr); - if (a.my_global_concurrency_mode.load(std::memory_order_relaxed)) - disable_mandatory_concurrency_impl(&a); - - remove_arena_from_list(a); - if (a.my_aba_epoch == my_arenas_aba_epoch.load(std::memory_order_relaxed)) { - my_arenas_aba_epoch.store(my_arenas_aba_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); - } -} - -void market::try_destroy_arena ( arena* a, uintptr_t aba_epoch, unsigned priority_level ) { - bool locked = true; - __TBB_ASSERT( a, nullptr); - // we hold reference to the server, so market cannot be destroyed at any moment here - __TBB_ASSERT(!is_poisoned(my_server), nullptr); - my_arenas_list_mutex.lock(); - arena_list_type::iterator it = my_arenas[priority_level].begin(); - for ( ; it != my_arenas[priority_level].end(); ++it ) { - if ( a == &*it ) { - if ( it->my_aba_epoch == aba_epoch ) { - // Arena is alive - // Acquire my_references to sync with threads that just left the arena - if (!a->my_num_workers_requested && !a->my_references.load(std::memory_order_acquire)) { - __TBB_ASSERT( - !a->my_num_workers_allotted.load(std::memory_order_relaxed) && - (a->my_pool_state == arena::SNAPSHOT_EMPTY || !a->my_max_num_workers), - "Inconsistent arena state" - ); - // Arena is abandoned. Destroy it. 
- detach_arena( *a ); - my_arenas_list_mutex.unlock(); - locked = false; - a->free_arena(); - } - } - if (locked) - my_arenas_list_mutex.unlock(); - return; - } - } - my_arenas_list_mutex.unlock(); -} +void market::update_allotment() { + int effective_soft_limit = my_mandatory_num_requested > 0 && my_num_workers_soft_limit == 0 ? 1 : my_num_workers_soft_limit; + int max_workers = min(my_total_demand, effective_soft_limit); + __TBB_ASSERT(max_workers >= 0, nullptr); -/** This method must be invoked under my_arenas_list_mutex. **/ -arena* market::arena_in_need ( arena_list_type* arenas, arena* hint ) { - // TODO: make sure arena with higher priority returned only if there are available slots in it. - hint = select_next_arena( hint ); - if ( !hint ) - return nullptr; - arena_list_type::iterator it = hint; - unsigned curr_priority_level = hint->my_priority_level; - __TBB_ASSERT( it != arenas[curr_priority_level].end(), nullptr ); - do { - arena& a = *it; - if ( ++it == arenas[curr_priority_level].end() ) { - do { - ++curr_priority_level %= num_priority_levels; - } while ( arenas[curr_priority_level].empty() ); - it = arenas[curr_priority_level].begin(); - } - if( a.num_workers_active() < a.my_num_workers_allotted.load(std::memory_order_relaxed) ) { - a.my_references += arena::ref_worker; - return &a; - } - } while ( it != hint ); - return nullptr; -} - -arena* market::arena_in_need(arena* prev) { - if (my_total_demand.load(std::memory_order_acquire) <= 0) - return nullptr; - arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex, /*is_writer=*/false); - // TODO: introduce three state response: alive, not_alive, no_market_arenas - if ( is_arena_alive(prev) ) - return arena_in_need(my_arenas, prev); - return arena_in_need(my_arenas, my_next_arena); -} - -int market::update_allotment ( arena_list_type* arenas, int workers_demand, int max_workers ) { - __TBB_ASSERT( workers_demand > 0, nullptr ); - max_workers = min(workers_demand, max_workers); int 
unassigned_workers = max_workers; int assigned = 0; int carry = 0; @@ -395,13 +80,11 @@ int market::update_allotment ( arena_list_type* arenas, int workers_demand, int for (unsigned list_idx = 0; list_idx < num_priority_levels; ++list_idx ) { int assigned_per_priority = min(my_priority_level_demand[list_idx], unassigned_workers); unassigned_workers -= assigned_per_priority; - for (arena_list_type::iterator it = arenas[list_idx].begin(); it != arenas[list_idx].end(); ++it) { - arena& a = *it; - __TBB_ASSERT(a.my_num_workers_requested >= 0, nullptr); - __TBB_ASSERT(a.my_num_workers_requested <= int(a.my_max_num_workers) - || (a.my_max_num_workers == 0 && a.my_local_concurrency_requests > 0 && a.my_num_workers_requested == 1), nullptr); - if (a.my_num_workers_requested == 0) { - __TBB_ASSERT(!a.my_num_workers_allotted.load(std::memory_order_relaxed), nullptr); + // We use reverse iterator there to serve last added clients first + for (auto it = my_clients[list_idx].rbegin(); it != my_clients[list_idx].rend(); ++it) { + tbb_permit_manager_client& client = static_cast(**it); + if (client.max_workers() == 0) { + client.set_allotment(0); continue; } @@ -410,233 +93,49 @@ int market::update_allotment ( arena_list_type* arenas, int workers_demand, int } int allotted = 0; -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - if (my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0) { + if (my_num_workers_soft_limit == 0) { __TBB_ASSERT(max_workers == 0 || max_workers == 1, nullptr); - allotted = a.my_global_concurrency_mode.load(std::memory_order_relaxed) && - assigned < max_workers ? 1 : 0; - } else -#endif - { - int tmp = a.my_num_workers_requested * assigned_per_priority + carry; + allotted = client.min_workers() > 0 && assigned < max_workers ? 
1 : 0; + } else { + int tmp = client.max_workers() * assigned_per_priority + carry; allotted = tmp / my_priority_level_demand[list_idx]; carry = tmp % my_priority_level_demand[list_idx]; - __TBB_ASSERT(allotted <= a.my_num_workers_requested, nullptr); - __TBB_ASSERT(allotted <= int(a.my_num_slots - a.my_num_reserved_slots), nullptr); + __TBB_ASSERT(allotted <= client.max_workers(), nullptr); } - a.my_num_workers_allotted.store(allotted, std::memory_order_relaxed); - a.my_is_top_priority.store(list_idx == max_priority_level, std::memory_order_relaxed); + client.set_allotment(allotted); + client.set_top_priority(list_idx == max_priority_level); assigned += allotted; } } - __TBB_ASSERT( 0 <= assigned && assigned <= max_workers, nullptr ); - return assigned; -} - -/** This method must be invoked under my_arenas_list_mutex. **/ -bool market::is_arena_in_list( arena_list_type &arenas, arena *a ) { - __TBB_ASSERT( a, "Expected non-null pointer to arena." ); - for ( arena_list_type::iterator it = arenas.begin(); it != arenas.end(); ++it ) - if ( a == &*it ) - return true; - return false; -} - -/** This method must be invoked under my_arenas_list_mutex. **/ -bool market::is_arena_alive(arena* a) { - if ( !a ) - return false; - - // Still cannot access internals of the arena since the object itself might be destroyed. 
- - for ( unsigned idx = 0; idx < num_priority_levels; ++idx ) { - if ( is_arena_in_list( my_arenas[idx], a ) ) - return true; - } - return false; -} - -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY -void market::enable_mandatory_concurrency_impl ( arena *a ) { - __TBB_ASSERT(!a->my_global_concurrency_mode.load(std::memory_order_relaxed), nullptr); - __TBB_ASSERT(my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0, nullptr); - - a->my_global_concurrency_mode.store(true, std::memory_order_relaxed); - my_mandatory_num_requested++; + __TBB_ASSERT(assigned == max_workers, nullptr); } -void market::enable_mandatory_concurrency ( arena *a ) { - int delta = 0; - { - arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex); - if (my_num_workers_soft_limit.load(std::memory_order_relaxed) != 0 || - a->my_global_concurrency_mode.load(std::memory_order_relaxed)) - return; - - enable_mandatory_concurrency_impl(a); - delta = update_workers_request(); +void market::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex); + if (my_num_workers_soft_limit != soft_limit) { + my_num_workers_soft_limit = soft_limit; + update_allotment(); } - - if (delta != 0) - my_server->adjust_job_count_estimate(delta); -} - -void market::disable_mandatory_concurrency_impl(arena* a) { - __TBB_ASSERT(a->my_global_concurrency_mode.load(std::memory_order_relaxed), nullptr); - __TBB_ASSERT(my_mandatory_num_requested > 0, nullptr); - - a->my_global_concurrency_mode.store(false, std::memory_order_relaxed); - my_mandatory_num_requested--; } -void market::mandatory_concurrency_disable ( arena *a ) { - int delta = 0; - { - arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex); - if (!a->my_global_concurrency_mode.load(std::memory_order_relaxed)) - return; - // There is a racy window in advertise_new_work between mandtory concurrency enabling and - // setting SNAPSHOT_FULL. It gives a chance to spawn request to disable mandatory concurrency. 
- // Therefore, we double check that there is no enqueued tasks. - if (a->has_enqueued_tasks()) - return; - - __TBB_ASSERT(my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0, nullptr); - disable_mandatory_concurrency_impl(a); +void market::adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) { + __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); - delta = update_workers_request(); - } - if (delta != 0) - my_server->adjust_job_count_estimate(delta); -} -#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */ - -void market::adjust_demand ( arena& a, int delta, bool mandatory ) { - if (!delta) { - return; - } - int target_epoch{}; + int delta{}; { - arenas_list_mutex_type::scoped_lock lock(my_arenas_list_mutex); - __TBB_ASSERT(theMarket != nullptr, "market instance was destroyed prematurely?"); -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - if (mandatory) { - __TBB_ASSERT(delta == 1 || delta == -1, nullptr); - // Count the number of mandatory requests and proceed only for 0->1 and 1->0 transitions. 
- a.my_local_concurrency_requests += delta; - if ((delta > 0 && a.my_local_concurrency_requests != 1) || - (delta < 0 && a.my_local_concurrency_requests != 0)) - { - return; - } - } -#endif - a.my_total_num_workers_requested += delta; - int target_workers = 0; - // Cap target_workers into interval [0, a.my_max_num_workers] - if (a.my_total_num_workers_requested > 0) { -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - // At least one thread should be requested when mandatory concurrency - int max_num_workers = int(a.my_max_num_workers); - if (a.my_local_concurrency_requests > 0 && max_num_workers == 0) { - max_num_workers = 1; - } -#endif - target_workers = min(a.my_total_num_workers_requested, max_num_workers); - } - - delta = target_workers - a.my_num_workers_requested; - - if (delta == 0) { - return; - } - - a.my_num_workers_requested += delta; - if (a.my_num_workers_requested == 0) { - a.my_num_workers_allotted.store(0, std::memory_order_relaxed); - } - - int total_demand = my_total_demand.load(std::memory_order_relaxed) + delta; - my_total_demand.store(total_demand, std::memory_order_relaxed); - my_priority_level_demand[a.my_priority_level] += delta; - unsigned effective_soft_limit = my_num_workers_soft_limit.load(std::memory_order_relaxed); - if (my_mandatory_num_requested > 0) { - __TBB_ASSERT(effective_soft_limit == 0, nullptr); - effective_soft_limit = 1; - } - - update_allotment(effective_soft_limit); - if (delta > 0) { - // can't overflow soft_limit, but remember values request by arenas in - // my_total_demand to not prematurely release workers to RML - if (my_num_workers_requested + delta > (int)effective_soft_limit) - delta = effective_soft_limit - my_num_workers_requested; - } - else { - // the number of workers should not be decreased below my_total_demand - if (my_num_workers_requested + delta < total_demand) - delta = min(total_demand, (int)effective_soft_limit) - my_num_workers_requested; - } - my_num_workers_requested += delta; - 
__TBB_ASSERT(my_num_workers_requested <= (int)effective_soft_limit, nullptr); + mutex_type::scoped_lock lock(my_mutex); + // Update client's state + delta = c.update_request(mandatory_delta, workers_delta); - target_epoch = a.my_adjust_demand_target_epoch++; - } - - a.my_adjust_demand_current_epoch.wait_until(target_epoch, /* context = */ target_epoch, std::memory_order_relaxed); - // Must be called outside of any locks - my_server->adjust_job_count_estimate( delta ); - a.my_adjust_demand_current_epoch.exchange(target_epoch + 1); - a.my_adjust_demand_current_epoch.notify_relaxed(target_epoch + 1); -} + // Update market's state + my_total_demand += delta; + my_priority_level_demand[c.priority_level()] += delta; + my_mandatory_num_requested += mandatory_delta; -void market::process( job& j ) { - thread_data& td = static_cast(j); - // td.my_arena can be dead. Don't access it until arena_in_need is called - arena *a = td.my_arena; - for (int i = 0; i < 2; ++i) { - while ( (a = arena_in_need(a)) ) { - a->process(td); - } - // Workers leave market because there is no arena in need. It can happen earlier than - // adjust_job_count_estimate() decreases my_slack and RML can put this thread to sleep. - // It might result in a busy-loop checking for my_slack<0 and calling this method instantly. - // the yield refines this spinning. 
- if ( !i ) { - yield(); - } + update_allotment(); } -} - -void market::cleanup( job& j) { - market::enforce([this] { return theMarket != this; }, nullptr ); - governor::auto_terminate(&j); -} - -void market::acknowledge_close_connection() { - destroy(); -} - -::rml::job* market::create_one_job() { - unsigned short index = ++my_first_unused_worker_idx; - __TBB_ASSERT( index > 0, nullptr); - ITT_THREAD_SET_NAME(_T("TBB Worker Thread")); - // index serves as a hint decreasing conflicts between workers when they migrate between arenas - thread_data* td = new(cache_aligned_allocate(sizeof(thread_data))) thread_data{ index, true }; - __TBB_ASSERT( index <= my_num_workers_hard_limit, nullptr); - __TBB_ASSERT( my_workers[index - 1].load(std::memory_order_relaxed) == nullptr, nullptr); - my_workers[index - 1].store(td, std::memory_order_release); - return td; -} - -void market::add_external_thread(thread_data& td) { - context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex); - my_masters.push_front(td); -} -void market::remove_external_thread(thread_data& td) { - context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex); - my_masters.remove(td); + notify_thread_request(delta); } } // namespace r1 diff --git a/third-party/tbb/src/tbb/market.h b/third-party/tbb/src/tbb/market.h index f3891df3..85532ff1 100644 --- a/third-party/tbb/src/tbb/market.h +++ b/third-party/tbb/src/tbb/market.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -17,291 +17,62 @@ #ifndef _TBB_market_H #define _TBB_market_H -#include "scheduler_common.h" -#include "market_concurrent_monitor.h" -#include "intrusive_list.h" -#include "rml_tbb.h" #include "oneapi/tbb/rw_mutex.h" +#include "oneapi/tbb/tbb_allocator.h" +#include "oneapi/tbb/task_arena.h" -#include "oneapi/tbb/spin_rw_mutex.h" -#include "oneapi/tbb/task_group.h" +#include "permit_manager.h" +#include "pm_client.h" #include - -#if defined(_MSC_VER) && defined(_Wp64) - // Workaround for overzealous compiler warnings in /Wp64 mode - #pragma warning (push) - #pragma warning (disable: 4244) -#endif +#include namespace tbb { namespace detail { - -namespace d1 { -class task_scheduler_handle; -} - namespace r1 { -class task_arena_base; -class task_group_context; - -//------------------------------------------------------------------------ -// Class market -//------------------------------------------------------------------------ - -class market : no_copy, rml::tbb_client { - friend class arena; - friend class task_arena_base; - template friend class custom_scheduler; - friend class task_group_context; - friend class governor; - friend class lifetime_control; - +class market : public permit_manager { public: - //! Keys for the arena map array. The lower the value the higher priority of the arena list. - static constexpr unsigned num_priority_levels = 3; - -private: - friend void ITT_DoUnsafeOneTimeInitialization (); - friend bool finalize_impl(d1::task_scheduler_handle& handle); - - typedef intrusive_list arena_list_type; - typedef intrusive_list thread_data_list_type; + market(unsigned soft_limit); - //! Currently active global market - static market* theMarket; + pm_client* create_client(arena& a) override; + void register_client(pm_client* client, d1::constraints&) override; + void unregister_and_destroy_client(pm_client& c) override; - typedef scheduler_mutex_type global_market_mutex_type; - - //! 
Mutex guarding creation/destruction of theMarket, insertions/deletions in my_arenas, and cancellation propagation - static global_market_mutex_type theMarketMutex; - - //! Lightweight mutex guarding accounting operations with arenas list - typedef rw_mutex arenas_list_mutex_type; - // TODO: introduce fine-grained (per priority list) locking of arenas. - arenas_list_mutex_type my_arenas_list_mutex; - - //! Pointer to the RML server object that services this TBB instance. - rml::tbb_server* my_server; - - //! Waiting object for external and coroutine waiters. - market_concurrent_monitor my_sleep_monitor; + //! Request that arena's need in workers should be adjusted. + void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) override; - //! Maximal number of workers allowed for use by the underlying resource manager - /** It can't be changed after market creation. **/ - unsigned my_num_workers_hard_limit; + //! Set number of active workers + void set_active_num_workers(int soft_limit) override; +private: + //! Recalculates the number of workers assigned to each arena in the list. + void update_allotment(); - //! Current application-imposed limit on the number of workers (see set_active_num_workers()) - /** It can't be more than my_num_workers_hard_limit. **/ - std::atomic my_num_workers_soft_limit; + //! Keys for the arena map array. The lower the value the higher priority of the arena list. + static constexpr unsigned num_priority_levels = d1::num_priority_levels; - //! Number of workers currently requested from RML - int my_num_workers_requested; + using mutex_type = d1::rw_mutex; + mutex_type my_mutex; - //! First unused index of worker - /** Used to assign indices to the new workers coming from RML, and busy part - of my_workers array. **/ - std::atomic my_first_unused_worker_idx; + //! Current application-imposed limit on the number of workers + int my_num_workers_soft_limit; //! 
Number of workers that were requested by all arenas on all priority levels - std::atomic my_total_demand; + int my_total_demand{0}; //! Number of workers that were requested by arenas per single priority list item - int my_priority_level_demand[num_priority_levels]; + int my_priority_level_demand[num_priority_levels] = {0}; -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY //! How many times mandatory concurrency was requested from the market - int my_mandatory_num_requested; -#endif + int my_mandatory_num_requested{0}; //! Per priority list of registered arenas - arena_list_type my_arenas[num_priority_levels]; - - //! The first arena to be checked when idle worker seeks for an arena to enter - /** The check happens in round-robin fashion. **/ - arena *my_next_arena; - - //! ABA prevention marker to assign to newly created arenas - std::atomic my_arenas_aba_epoch; - - //! Reference count controlling market object lifetime - std::atomic my_ref_count; - - //! Count of external threads attached - std::atomic my_public_ref_count; - - //! Stack size of worker threads - std::size_t my_stack_size; - - //! Shutdown mode - bool my_join_workers; - - //! The value indicating that the soft limit warning is unnecessary - static const unsigned skip_soft_limit_warning = ~0U; - - //! Either workers soft limit to be reported via runtime_warning() or skip_soft_limit_warning - std::atomic my_workers_soft_limit_to_report; - - //! Constructor - market ( unsigned workers_soft_limit, unsigned workers_hard_limit, std::size_t stack_size ); - - //! Destructor - ~market(); - - //! Destroys and deallocates market object created by market::create() - void destroy (); - - //! Recalculates the number of workers requested from RML and updates the allotment. - int update_workers_request(); - - //! Recalculates the number of workers assigned to each arena in the list. - /** The actual number of workers servicing a particular arena may temporarily - deviate from the calculated value. 
**/ - void update_allotment (unsigned effective_soft_limit) { - int total_demand = my_total_demand.load(std::memory_order_relaxed); - if (total_demand) { - update_allotment(my_arenas, total_demand, (int)effective_soft_limit); - } - } - - //! Returns next arena that needs more workers, or nullptr. - arena* arena_in_need(arena* prev); - - template - static void enforce (Pred pred, const char* msg) { - suppress_unused_warning(pred, msg); -#if TBB_USE_ASSERT - global_market_mutex_type::scoped_lock lock(theMarketMutex); - __TBB_ASSERT(pred(), msg); -#endif - } - - //////////////////////////////////////////////////////////////////////////////// - // Helpers to unify code branches dependent on priority feature presence - - arena* select_next_arena( arena* hint ); - - void insert_arena_into_list ( arena& a ); - - void remove_arena_from_list ( arena& a ); - - arena* arena_in_need ( arena_list_type* arenas, arena* hint ); - - int update_allotment ( arena_list_type* arenas, int total_demand, int max_workers ); - - bool is_arena_in_list( arena_list_type& arenas, arena* a ); - - bool is_arena_alive( arena* a ); - - //////////////////////////////////////////////////////////////////////////////// - // Implementation of rml::tbb_client interface methods - - version_type version () const override { return 0; } - - unsigned max_job_count () const override { return my_num_workers_hard_limit; } - - std::size_t min_stack_size () const override { return worker_stack_size(); } - - job* create_one_job () override; - - void cleanup( job& j ) override; - - void acknowledge_close_connection () override; - - void process( job& j ) override; - -public: - //! Factory method creating new market object - static market& global_market( bool is_public, unsigned max_num_workers = 0, std::size_t stack_size = 0 ); - - //! 
Add reference to market if theMarket exists - static bool add_ref_unsafe( global_market_mutex_type::scoped_lock& lock, bool is_public, unsigned max_num_workers = 0, std::size_t stack_size = 0 ); - - //! Creates an arena object - /** If necessary, also creates global market instance, and boosts its ref count. - Each call to create_arena() must be matched by the call to arena::free_arena(). **/ - static arena* create_arena ( int num_slots, int num_reserved_slots, - unsigned arena_index, std::size_t stack_size ); - - //! Removes the arena from the market's list - void try_destroy_arena ( arena*, uintptr_t aba_epoch, unsigned priority_level ); - - //! Removes the arena from the market's list - void detach_arena ( arena& ); - - //! Decrements market's refcount and destroys it in the end - bool release ( bool is_public, bool blocking_terminate ); - - //! Return wait list - market_concurrent_monitor& get_wait_list() { return my_sleep_monitor; } - -#if __TBB_ENQUEUE_ENFORCED_CONCURRENCY - //! Imlpementation of mandatory concurrency enabling - void enable_mandatory_concurrency_impl ( arena *a ); - - //! Inform the external thread that there is an arena with mandatory concurrency - void enable_mandatory_concurrency ( arena *a ); - - //! Inform the external thread that the arena is no more interested in mandatory concurrency - void disable_mandatory_concurrency_impl(arena* a); - - //! Inform the external thread that the arena is no more interested in mandatory concurrency - void mandatory_concurrency_disable ( arena *a ); -#endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */ - - //! Request that arena's need in workers should be adjusted. - /** Concurrent invocations are possible only on behalf of different arenas. **/ - void adjust_demand ( arena&, int delta, bool mandatory ); - - //! Used when RML asks for join mode during workers termination. - bool must_join_workers () const { return my_join_workers; } - - //! Returns the requested stack size of worker threads. 
- std::size_t worker_stack_size () const { return my_stack_size; } - - //! Set number of active workers - static void set_active_num_workers( unsigned w ); - - //! Reports active parallelism level according to user's settings - static unsigned app_parallelism_limit(); - - //! Reports if any active global lifetime references are present - static unsigned is_lifetime_control_present(); - - //! Finds all contexts affected by the state change and propagates the new state to them. - /** The propagation is relayed to the market because tasks created by one - external thread can be passed to and executed by other external threads. This means - that context trees can span several arenas at once and thus state change - propagation cannot be generally localized to one arena only. **/ - template - bool propagate_task_group_state (std::atomic d1::task_group_context::*mptr_state, d1::task_group_context& src, T new_state ); - - //! List of registered external threads - thread_data_list_type my_masters; - - //! Array of pointers to the registered workers - /** Used by cancellation propagation mechanism. - Must be the last data member of the class market. **/ - std::atomic my_workers[1]; - - static unsigned max_num_workers() { - global_market_mutex_type::scoped_lock lock( theMarketMutex ); - return theMarket? 
theMarket->my_num_workers_hard_limit : 0; - } - - void add_external_thread(thread_data& td); - - void remove_external_thread(thread_data& td); + using clients_container_type = std::vector>; + clients_container_type my_clients[num_priority_levels]; }; // class market } // namespace r1 } // namespace detail } // namespace tbb -#if defined(_MSC_VER) && defined(_Wp64) - // Workaround for overzealous compiler warnings in /Wp64 mode - #pragma warning (pop) -#endif // warning 4244 is back - #endif /* _TBB_market_H */ diff --git a/third-party/tbb/src/tbb/misc.cpp b/third-party/tbb/src/tbb/misc.cpp index 17da1238..115a5f38 100644 --- a/third-party/tbb/src/tbb/misc.cpp +++ b/third-party/tbb/src/tbb/misc.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -92,6 +92,8 @@ void PrintExtraVersionInfo( const char* category, const char* format, ... ) { //! check for transaction support. 
#if _MSC_VER #include // for __cpuid +#elif __APPLE__ +#include #endif #if __TBB_x86_32 || __TBB_x86_64 @@ -131,13 +133,22 @@ void detect_cpu_features(cpu_features_type& cpu_features) { #if __TBB_x86_32 || __TBB_x86_64 const int rtm_ebx_mask = 1 << 11; const int waitpkg_ecx_mask = 1 << 5; + const int hybrid_edx_mask = 1 << 15; int registers[4] = {0}; - // Check RTM and WAITPKG + // Check RTM, WAITPKG, HYBRID check_cpuid(7, 0, registers); cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0; cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0; -#endif /* (__TBB_x86_32 || __TBB_x86_64) */ + cpu_features.hybrid = (registers[3] & hybrid_edx_mask) != 0; +#elif __APPLE__ + // Check HYBRID (hw.nperflevels > 1) + uint64_t nperflevels = 0; + size_t nperflevels_size = sizeof(nperflevels); + if (!sysctlbyname("hw.nperflevels", &nperflevels, &nperflevels_size, nullptr, 0)) { + cpu_features.hybrid = (nperflevels > 1); + } +#endif } } // namespace r1 diff --git a/third-party/tbb/src/tbb/misc.h b/third-party/tbb/src/tbb/misc.h index b11c0029..988c29b1 100644 --- a/third-party/tbb/src/tbb/misc.h +++ b/third-party/tbb/src/tbb/misc.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -211,6 +211,7 @@ T1 atomic_update(std::atomic& dst, T1 newValue, Pred compare) { struct cpu_features_type { bool rtm_enabled{false}; bool waitpkg_enabled{false}; + bool hybrid{false}; }; void detect_cpu_features(cpu_features_type& cpu_features); diff --git a/third-party/tbb/src/tbb/misc_ex.cpp b/third-party/tbb/src/tbb/misc_ex.cpp index 55be0af3..13b7b04f 100644 --- a/third-party/tbb/src/tbb/misc_ex.cpp +++ b/third-party/tbb/src/tbb/misc_ex.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -215,6 +215,7 @@ int AvailableHwConcurrency() { } fscanf(fp, ","); } + fclose(fp); return (num_cpus > 0) ? num_cpus : 1; } diff --git a/third-party/tbb/src/tbb/permit_manager.h b/third-party/tbb/src/tbb/permit_manager.h new file mode 100644 index 00000000..0a6a737c --- /dev/null +++ b/third-party/tbb/src/tbb/permit_manager.h @@ -0,0 +1,61 @@ +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_permit_manager_H +#define _TBB_permit_manager_H + +#include "oneapi/tbb/info.h" +#include "oneapi/tbb/detail/_utils.h" +#include "thread_request_serializer.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class pm_client; + +class permit_manager : no_copy { +public: + virtual ~permit_manager() {} + virtual pm_client* create_client(arena& a) = 0; + virtual void register_client(pm_client* client, d1::constraints& constraints) = 0; + virtual void unregister_and_destroy_client(pm_client& c) = 0; + + virtual void set_active_num_workers(int soft_limit) = 0; + virtual void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) = 0; + + void set_thread_request_observer(thread_request_observer& tr_observer) { + __TBB_ASSERT(!my_thread_request_observer, "set_thread_request_observer was called already?"); + my_thread_request_observer = &tr_observer; + } +protected: + void notify_thread_request(int delta) { + __TBB_ASSERT(my_thread_request_observer, "set_thread_request_observer was not called?"); + if (delta) { + my_thread_request_observer->update(delta); + } + } +private: + thread_request_observer* my_thread_request_observer{nullptr}; +}; + + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_permit_manager_H diff --git a/third-party/tbb/src/tbb/pm_client.h b/third-party/tbb/src/tbb/pm_client.h new file mode 100644 index 00000000..d08af824 --- /dev/null +++ b/third-party/tbb/src/tbb/pm_client.h @@ -0,0 +1,76 @@ +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_pm_client_H +#define _TBB_pm_client_H + +#include "arena.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class pm_client { +public: + pm_client(arena& a) : my_arena(a) {} + virtual ~pm_client() {} + + unsigned priority_level() { + return my_arena.priority_level(); + } + + void set_top_priority(bool b) { + my_arena.set_top_priority(b); + } + + int min_workers() const { + return my_min_workers; + } + + int max_workers() const { + return my_max_workers; + } + + int update_request(int mandatory_delta, int workers_delta) { + auto min_max_workers = my_arena.update_request(mandatory_delta, workers_delta); + int delta = min_max_workers.second - my_max_workers; + set_workers(min_max_workers.first, min_max_workers.second); + return delta; + } + + virtual void register_thread() = 0; + + virtual void unregister_thread() = 0; + + +protected: + void set_workers(int mn_w, int mx_w) { + __TBB_ASSERT(mn_w >= 0, nullptr); + __TBB_ASSERT(mx_w >= 0, nullptr); + my_min_workers = mn_w; + my_max_workers = mx_w; + } + + arena& my_arena; + int my_min_workers{0}; + int my_max_workers{0}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_pm_client_H diff --git a/third-party/tbb/src/tbb/rml_tbb.cpp b/third-party/tbb/src/tbb/rml_tbb.cpp index 4c772eae..d1cd285c 100644 --- a/third-party/tbb/src/tbb/rml_tbb.cpp +++ b/third-party/tbb/src/tbb/rml_tbb.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -49,7 +49,7 @@ namespace rml { #if _WIN32 || _WIN64 #define RML_SERVER_NAME "irml" DEBUG_SUFFIX ".dll" #elif __APPLE__ -#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".dylib" +#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".1.dylib" #elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX #define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so" #elif __unix__ diff --git a/third-party/tbb/src/tbb/scheduler_common.h b/third-party/tbb/src/tbb/scheduler_common.h index 9e103657..f9e8a68d 100644 --- a/third-party/tbb/src/tbb/scheduler_common.h +++ b/third-party/tbb/src/tbb/scheduler_common.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -41,6 +41,7 @@ #include #include +#include // unique_ptr //! Mutex type for global locks in the scheduler using scheduler_mutex_type = __TBB_SCHEDULER_MUTEX_TYPE; @@ -68,6 +69,22 @@ template class task_stream; using isolation_type = std::intptr_t; constexpr isolation_type no_isolation = 0; +struct cache_aligned_deleter { + template + void operator() (T* ptr) const { + ptr->~T(); + cache_aligned_deallocate(ptr); + } +}; + +template +using cache_aligned_unique_ptr = std::unique_ptr; + +template +cache_aligned_unique_ptr make_cache_aligned_unique(Args&& ...args) { + return cache_aligned_unique_ptr(new (cache_aligned_allocate(sizeof(T))) T(std::forward(args)...)); +} + //------------------------------------------------------------------------ // Extended execute data //------------------------------------------------------------------------ @@ -225,9 +242,10 @@ inline void prolonged_pause() { std::uint64_t time_stamp = machine_time_stamp(); // _tpause function directs the processor to enter an implementation-dependent optimized state // until the Time Stamp Counter reaches or exceeds the value specified in second parameter. 
- // Constant "700" is ticks to wait for. + // Constant "1000" is ticks to wait for. + // TODO : Modify this parameter based on empirical study of benchmarks. // First parameter 0 selects between a lower power (cleared) or faster wakeup (set) optimized state. - _tpause(0, time_stamp + 700); + _tpause(0, time_stamp + 1000); } else #endif @@ -245,17 +263,12 @@ class stealing_loop_backoff { int my_yield_count; public: // my_yield_threshold = 100 is an experimental value. Ideally, once we start calling __TBB_Yield(), - // the time spent spinning before calling is_out_of_work() should be approximately + // the time spent spinning before calling out_of_work() should be approximately // the time it takes for a thread to be woken up. Doing so would guarantee that we do // no worse than 2x the optimal spin time. Or perhaps a time-slice quantum is the right amount. stealing_loop_backoff(int num_workers, int yields_multiplier) : my_pause_threshold{ 2 * (num_workers + 1) } -#if __APPLE__ - // threshold value tuned separately for macOS due to high cost of sched_yield there - , my_yield_threshold{10 * yields_multiplier} -#else , my_yield_threshold{100 * yields_multiplier} -#endif , my_pause_count{} , my_yield_count{} {} @@ -548,6 +561,7 @@ class alignas (max_nfs_size) task_dispatcher { #endif inline std::uintptr_t calculate_stealing_threshold(std::uintptr_t base, std::size_t stack_size) { + __TBB_ASSERT(stack_size != 0, "Stack size cannot be zero"); __TBB_ASSERT(base > stack_size / 2, "Stack anchor calculation overflow"); return base - stack_size / 2; } @@ -558,8 +572,7 @@ struct task_group_context_impl { static void register_with(d1::task_group_context&, thread_data*); static void bind_to_impl(d1::task_group_context&, thread_data*); static void bind_to(d1::task_group_context&, thread_data*); - template - static void propagate_task_group_state(d1::task_group_context&, std::atomic d1::task_group_context::*, d1::task_group_context&, T); + static void 
propagate_task_group_state(d1::task_group_context&, std::atomic d1::task_group_context::*, d1::task_group_context&, uint32_t); static bool cancel_group_execution(d1::task_group_context&); static bool is_group_execution_cancelled(const d1::task_group_context&); static void reset(d1::task_group_context&); diff --git a/third-party/tbb/src/tbb/task.cpp b/third-party/tbb/src/tbb/task.cpp index bd4e32df..08463bf3 100644 --- a/third-party/tbb/src/tbb/task.cpp +++ b/third-party/tbb/src/tbb/task.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -53,7 +53,7 @@ void resume(suspend_point_type* sp) { // Prolong the arena's lifetime while all coroutines are alive // (otherwise the arena can be destroyed while some tasks are suspended). arena& a = *sp->m_arena; - a.my_references += arena::ref_external; + a.my_references += arena::ref_worker; if (task_disp.m_properties.critical_task_allowed) { // The target is not in the process of executing critical task, so the resume task is not critical. @@ -67,7 +67,7 @@ void resume(suspend_point_type* sp) { // Do not access target after that point. a.advertise_new_work(); // Release our reference to my_arena. 
- a.on_thread_leaving(); + a.on_thread_leaving(arena::ref_worker); } } @@ -77,13 +77,13 @@ suspend_point_type* current_suspend_point() { return td.my_task_dispatcher->get_suspend_point(); } -static task_dispatcher& create_coroutine(thread_data& td) { +task_dispatcher& create_coroutine(thread_data& td) { // We may have some task dispatchers cached task_dispatcher* task_disp = td.my_arena->my_co_cache.pop(); if (!task_disp) { void* ptr = cache_aligned_allocate(sizeof(task_dispatcher)); task_disp = new(ptr) task_dispatcher(td.my_arena); - task_disp->init_suspend_point(td.my_arena, td.my_arena->my_market->worker_stack_size()); + task_disp->init_suspend_point(td.my_arena, td.my_arena->my_threading_control->worker_stack_size()); } // Prolong the arena's lifetime until all coroutines is alive // (otherwise the arena can be destroyed while some tasks are suspended). @@ -163,7 +163,7 @@ void task_dispatcher::do_post_resume_action() { case post_resume_action::register_waiter: { __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); - static_cast(td->my_post_resume_arg)->notify(); + static_cast(td->my_post_resume_arg)->notify(); break; } case post_resume_action::cleanup: @@ -171,7 +171,7 @@ void task_dispatcher::do_post_resume_action() { __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); task_dispatcher* to_cleanup = static_cast(td->my_post_resume_arg); // Release coroutine's reference to my_arena - td->my_arena->on_thread_leaving(); + td->my_arena->on_thread_leaving(arena::ref_external); // Cache the coroutine for possible later re-usage td->my_arena->my_co_cache.push(to_cleanup); break; @@ -186,7 +186,7 @@ void task_dispatcher::do_post_resume_action() { auto is_our_suspend_point = [sp] (market_context ctx) { return std::uintptr_t(sp) == ctx.my_uniq_addr; }; - td->my_arena->my_market->get_wait_list().notify(is_our_suspend_point); + td->my_arena->get_waiting_threads_monitor().notify(is_our_suspend_point); 
break; } default: @@ -218,7 +218,7 @@ void notify_waiters(std::uintptr_t wait_ctx_addr) { return wait_ctx_addr == context.my_uniq_addr; }; - r1::governor::get_thread_data()->my_arena->my_market->get_wait_list().notify(is_related_wait_ctx); + governor::get_thread_data()->my_arena->get_waiting_threads_monitor().notify(is_related_wait_ctx); } } // namespace r1 diff --git a/third-party/tbb/src/tbb/task_dispatcher.h b/third-party/tbb/src/tbb/task_dispatcher.h index f6ff3f17..20c7c731 100644 --- a/third-party/tbb/src/tbb/task_dispatcher.h +++ b/third-party/tbb/src/tbb/task_dispatcher.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2022 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include "mailbox.h" #include "itt_notify.h" #include "concurrent_monitor.h" +#include "threading_control.h" #include @@ -65,13 +66,13 @@ inline d1::task* suspend_point_type::resume_task::execute(d1::execution_data& ed execution_data_ext& ed_ext = static_cast(ed); if (ed_ext.wait_ctx) { - market_concurrent_monitor::resume_context monitor_node{{std::uintptr_t(ed_ext.wait_ctx), nullptr}, ed_ext, m_target}; + thread_control_monitor::resume_context monitor_node{{std::uintptr_t(ed_ext.wait_ctx), nullptr}, ed_ext, m_target}; // The wait_ctx is present only in external_waiter. In that case we leave the current stack // in the abandoned state to resume when waiting completes. 
thread_data* td = ed_ext.task_disp->m_thread_data; td->set_post_resume_action(task_dispatcher::post_resume_action::register_waiter, &monitor_node); - market_concurrent_monitor& wait_list = td->my_arena->my_market->get_wait_list(); + thread_control_monitor& wait_list = td->my_arena->get_waiting_threads_monitor(); if (wait_list.wait([&] { return !ed_ext.wait_ctx->continue_execution(); }, monitor_node)) { return nullptr; diff --git a/third-party/tbb/src/tbb/task_group_context.cpp b/third-party/tbb/src/tbb/task_group_context.cpp index 177dd555..c20b2790 100644 --- a/third-party/tbb/src/tbb/task_group_context.cpp +++ b/third-party/tbb/src/tbb/task_group_context.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -197,8 +197,7 @@ void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) != d1::task_group_context::state::locked, nullptr); } -template -void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) { +void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) { __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); /* 1. if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state): Nothing to do, whether descending from "src" or not, so no need to scan. 
@@ -224,50 +223,6 @@ void task_group_context_impl::propagate_task_group_state(d1::task_group_context& } } -template -void thread_data::propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) { - mutex::scoped_lock lock(my_context_list->m_mutex); - // Acquire fence is necessary to ensure that the subsequent node->my_next load - // returned the correct value in case it was just inserted in another thread. - // The fence also ensures visibility of the correct ctx.my_parent value. - for (context_list::iterator it = my_context_list->begin(); it != my_context_list->end(); ++it) { - d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, &(*it)); - if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state) - task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state); - } - // Sync up local propagation epoch with the global one. Release fence prevents - // reordering of possible store to *mptr_state after the sync point. - my_context_list->epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release); -} - -template -bool market::propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) { - if (src.my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) - return true; - // The whole propagation algorithm is under the lock in order to ensure correctness - // in case of concurrent state changes at the different levels of the context tree. - // See comment at the bottom of scheduler.cpp - context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex); - if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state) - // Another thread has concurrently changed the state. Back down. 
- return false; - // Advance global state propagation epoch - ++the_context_state_propagation_epoch; - // Propagate to all workers and external threads and sync up their local epochs with the global one - unsigned num_workers = my_first_unused_worker_idx; - for (unsigned i = 0; i < num_workers; ++i) { - thread_data* td = my_workers[i].load(std::memory_order_acquire); - // If the worker is only about to be registered, skip it. - if (td) - td->propagate_task_group_state(mptr_state, src, new_state); - } - // Propagate to all external threads - // The whole propagation sequence is locked, thus no contention is expected - for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); it++) - it->propagate_task_group_state(mptr_state, src, new_state); - return true; -} - bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) { __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1"); @@ -277,7 +232,7 @@ bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.) 
return false; } - governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1)); + governor::get_thread_data()->my_arena->my_threading_control->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1)); return true; } diff --git a/third-party/tbb/src/tbb/tbb.rc b/third-party/tbb/src/tbb/tbb.rc index 6c8b99fc..57e9d391 100644 --- a/third-party/tbb/src/tbb/tbb.rc +++ b/third-party/tbb/src/tbb/tbb.rc @@ -1,4 +1,4 @@ -// Copyright (c) 2005-2023 Intel Corporation +// Copyright (c) 2005-2024 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbb12.dll\0" diff --git a/third-party/tbb/src/tbb/tcm.h b/third-party/tbb/src/tbb/tcm.h new file mode 100644 index 00000000..05fe0434 --- /dev/null +++ b/third-party/tbb/src/tbb/tcm.h @@ -0,0 +1,173 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_tcm_H +#define _TBB_tcm_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Support for the TCM API return value + +typedef enum _tcm_result_t { + TCM_RESULT_SUCCESS = 0x0, + TCM_RESULT_ERROR_INVALID_ARGUMENT = 0x78000004, + TCM_RESULT_ERROR_UNKNOWN = 0x7ffffffe +} tcm_result_t; + +// Support for permit states + +enum tcm_permit_states_t { + TCM_PERMIT_STATE_VOID, + TCM_PERMIT_STATE_INACTIVE, + TCM_PERMIT_STATE_PENDING, + TCM_PERMIT_STATE_IDLE, + TCM_PERMIT_STATE_ACTIVE +}; + +typedef uint8_t tcm_permit_state_t; + +// Support for permit flags + +typedef struct _tcm_permit_flags_t { + uint32_t stale : 1; + uint32_t rigid_concurrency : 1; + uint32_t exclusive : 1; + uint32_t reserved : 29; +} tcm_permit_flags_t; + +typedef struct _tcm_callback_flags_t { + uint32_t new_concurrency : 1; + uint32_t new_state : 1; + uint32_t reserved : 30; +} tcm_callback_flags_t; + +// Support for cpu masks + +struct hwloc_bitmap_s; +typedef struct hwloc_bitmap_s* hwloc_bitmap_t; +typedef hwloc_bitmap_t tcm_cpu_mask_t; + +// Support for ids + +typedef uint64_t tcm_client_id_t; + +// Support for permits + +typedef struct _tcm_permit_t { + uint32_t* concurrencies; + tcm_cpu_mask_t* cpu_masks; + uint32_t size; + tcm_permit_state_t state; + tcm_permit_flags_t flags; +} tcm_permit_t; + +// Support for permit handle + +typedef struct tcm_permit_rep_t* tcm_permit_handle_t; + +// Support for constraints + +typedef int32_t tcm_numa_node_t; +typedef int32_t tcm_core_type_t; + +const int8_t tcm_automatic = -1; +const int8_t tcm_any = -2; + +#define TCM_PERMIT_REQUEST_CONSTRAINTS_INITIALIZER {tcm_automatic, tcm_automatic, NULL, \ + tcm_automatic, tcm_automatic, tcm_automatic} + +typedef struct _tcm_cpu_constraints_t { + int32_t min_concurrency; + int32_t max_concurrency; + tcm_cpu_mask_t mask; + tcm_numa_node_t numa_id; + tcm_core_type_t core_type_id; + 
int32_t threads_per_core; +} tcm_cpu_constraints_t; + +// Support for priorities + +enum tcm_request_priorities_t { + TCM_REQUEST_PRIORITY_LOW = (INT32_MAX / 4) * 1, + TCM_REQUEST_PRIORITY_NORMAL = (INT32_MAX / 4) * 2, + TCM_REQUEST_PRIORITY_HIGH = (INT32_MAX / 4) * 3 +}; + +typedef int32_t tcm_request_priority_t; + +// Support for requests + +#define TCM_PERMIT_REQUEST_INITIALIZER {tcm_automatic, tcm_automatic, \ + NULL, 0, TCM_REQUEST_PRIORITY_NORMAL, {}, {}} + +typedef struct _tcm_permit_request_t { + int32_t min_sw_threads; + int32_t max_sw_threads; + tcm_cpu_constraints_t* cpu_constraints; + uint32_t constraints_size; + tcm_request_priority_t priority; + tcm_permit_flags_t flags; + char reserved[4]; +} tcm_permit_request_t; + +// Support for client callback + +typedef tcm_result_t (*tcm_callback_t)(tcm_permit_handle_t p, void* callback_arg, tcm_callback_flags_t); + +#if _WIN32 + #define __TCM_EXPORT __declspec(dllexport) +#else + #define __TCM_EXPORT +#endif + + +__TCM_EXPORT tcm_result_t tcmConnect(tcm_callback_t callback, + tcm_client_id_t *client_id); +__TCM_EXPORT tcm_result_t tcmDisconnect(tcm_client_id_t client_id); + +__TCM_EXPORT tcm_result_t tcmRequestPermit(tcm_client_id_t client_id, + tcm_permit_request_t request, + void* callback_arg, + tcm_permit_handle_t* permit_handle, + tcm_permit_t* permit); + +__TCM_EXPORT tcm_result_t tcmGetPermitData(tcm_permit_handle_t permit_handle, + tcm_permit_t* permit); + +__TCM_EXPORT tcm_result_t tcmReleasePermit(tcm_permit_handle_t permit); + +__TCM_EXPORT tcm_result_t tcmIdlePermit(tcm_permit_handle_t permit_handle); + +__TCM_EXPORT tcm_result_t tcmDeactivatePermit(tcm_permit_handle_t permit_handle); + +__TCM_EXPORT tcm_result_t tcmActivatePermit(tcm_permit_handle_t permit_handle); + +__TCM_EXPORT tcm_result_t tcmRegisterThread(tcm_permit_handle_t permit_handle); + +__TCM_EXPORT tcm_result_t tcmUnregisterThread(); + +__TCM_EXPORT tcm_result_t tcmGetVersionInfo(char* buffer, uint32_t buffer_size); + +#ifdef 
__cplusplus +} // extern "C" +#endif + +#endif /* _TBB_tcm_H */ diff --git a/third-party/tbb/src/tbb/tcm_adaptor.cpp b/third-party/tbb/src/tbb/tcm_adaptor.cpp new file mode 100644 index 00000000..e20ebb83 --- /dev/null +++ b/third-party/tbb/src/tbb/tcm_adaptor.cpp @@ -0,0 +1,321 @@ +/* + Copyright (c) 2023-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "oneapi/tbb/detail/_intrusive_list_node.h" +#include "oneapi/tbb/detail/_template_helpers.h" +#include "oneapi/tbb/task_arena.h" + +#include "pm_client.h" +#include "dynamic_link.h" +#include "misc.h" +#include "tcm.h" +#include "tcm_adaptor.h" + +#include + +namespace tbb { +namespace detail { +namespace r1 { + +namespace { +#if __TBB_WEAK_SYMBOLS_PRESENT +#pragma weak tcmConnect +#pragma weak tcmDisconnect +#pragma weak tcmRequestPermit +#pragma weak tcmGetPermitData +#pragma weak tcmReleasePermit +#pragma weak tcmIdlePermit +#pragma weak tcmDeactivatePermit +#pragma weak tcmActivatePermit +#pragma weak tcmRegisterThread +#pragma weak tcmUnregisterThread +#pragma weak tcmGetVersionInfo +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +tcm_result_t(*tcm_connect)(tcm_callback_t callback, tcm_client_id_t* client_id){nullptr}; +tcm_result_t(*tcm_disconnect)(tcm_client_id_t client_id){ nullptr }; +tcm_result_t(*tcm_request_permit)(tcm_client_id_t client_id, tcm_permit_request_t request, + void* callback_arg, tcm_permit_handle_t* permit_handle, tcm_permit_t* permit){nullptr}; 
+tcm_result_t(*tcm_get_permit_data)(tcm_permit_handle_t permit_handle, tcm_permit_t* permit){nullptr}; +tcm_result_t(*tcm_release_permit)(tcm_permit_handle_t permit){nullptr}; +tcm_result_t(*tcm_idle_permit)(tcm_permit_handle_t permit_handle){nullptr}; +tcm_result_t(*tcm_deactivate_permit)(tcm_permit_handle_t permit_handle){nullptr}; +tcm_result_t(*tcm_activate_permit)(tcm_permit_handle_t permit_handle){nullptr}; +tcm_result_t(*tcm_register_thread)(tcm_permit_handle_t permit_handle){nullptr}; +tcm_result_t(*tcm_unregister_thread)(){nullptr}; +tcm_result_t (*tcm_get_version_info)(char* buffer, uint32_t buffer_size){nullptr}; + +static const dynamic_link_descriptor tcm_link_table[] = { + DLD(tcmConnect, tcm_connect), + DLD(tcmDisconnect, tcm_disconnect), + DLD(tcmRequestPermit, tcm_request_permit), + DLD(tcmGetPermitData, tcm_get_permit_data), + DLD(tcmReleasePermit, tcm_release_permit), + DLD(tcmIdlePermit, tcm_idle_permit), + DLD(tcmDeactivatePermit, tcm_deactivate_permit), + DLD(tcmActivatePermit, tcm_activate_permit), + DLD(tcmRegisterThread, tcm_register_thread), + DLD(tcmUnregisterThread, tcm_unregister_thread), + DLD(tcmGetVersionInfo, tcm_get_version_info) +}; + +#if TBB_USE_DEBUG +#define DEBUG_SUFFIX "_debug" +#else +#define DEBUG_SUFFIX +#endif /* TBB_USE_DEBUG */ + +#if _WIN32 || _WIN64 +#define LIBRARY_EXTENSION ".dll" +#define LIBRARY_PREFIX +#elif __unix__ +#define LIBRARY_EXTENSION ".so.1" +#define LIBRARY_PREFIX "lib" +#else +#define LIBRARY_EXTENSION +#define LIBRARY_PREFIX +#endif /* __unix__ */ + +#define TCMLIB_NAME LIBRARY_PREFIX "tcm" DEBUG_SUFFIX LIBRARY_EXTENSION + +static bool tcm_functions_loaded{ false }; +} + +class tcm_client : public pm_client { + using tcm_client_mutex_type = d1::mutex; +public: + tcm_client(tcm_adaptor& adaptor, arena& a) : pm_client(a), my_tcm_adaptor(adaptor) {} + + ~tcm_client() { + if (my_permit_handle) { + __TBB_ASSERT(tcm_release_permit, nullptr); + auto res = tcm_release_permit(my_permit_handle); + 
__TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); + } + } + + int update_concurrency(uint32_t concurrency) { + return my_arena.update_concurrency(concurrency); + } + + unsigned priority_level() { + return my_arena.priority_level(); + } + + tcm_permit_request_t& permit_request() { + return my_permit_request; + } + + tcm_permit_handle_t& permit_handle() { + return my_permit_handle; + } + + void actualize_permit() { + __TBB_ASSERT(tcm_get_permit_data, nullptr); + int delta{}; + { + tcm_client_mutex_type::scoped_lock lock(my_permit_mutex); + + uint32_t new_concurrency{}; + tcm_permit_t new_permit{ &new_concurrency, nullptr, 1, TCM_PERMIT_STATE_VOID, {} }; + auto res = tcm_get_permit_data(my_permit_handle, &new_permit); + __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); + + // The permit has changed during the reading, so the callback will be invoked soon one more time and + // we can just skip this renegotiation iteration. + if (!new_permit.flags.stale) { + // If there is no other demand in TCM, the permit may still have granted concurrency but + // be in the deactivated state thus we enforce 0 allotment to preserve arena invariants. + delta = update_concurrency(new_permit.state != TCM_PERMIT_STATE_INACTIVE ? new_concurrency : 0); + } + } + if (delta) { + my_tcm_adaptor.notify_thread_request(delta); + } + } + + void request_permit(tcm_client_id_t client_id) { + __TBB_ASSERT(tcm_request_permit, nullptr); + + my_permit_request.max_sw_threads = max_workers(); + my_permit_request.min_sw_threads = my_permit_request.max_sw_threads == 0 ? 
0 : min_workers(); + + if (my_permit_request.constraints_size > 0) { + my_permit_request.cpu_constraints->min_concurrency = my_permit_request.min_sw_threads; + my_permit_request.cpu_constraints->max_concurrency = my_permit_request.max_sw_threads; + } + + __TBB_ASSERT(my_permit_request.max_sw_threads >= my_permit_request.min_sw_threads, nullptr); + + tcm_result_t res = tcm_request_permit(client_id, my_permit_request, this, &my_permit_handle, nullptr); + __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); + } + + void deactivate_permit() { + __TBB_ASSERT(tcm_deactivate_permit, nullptr); + tcm_result_t res = tcm_deactivate_permit(my_permit_handle); + __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); + } + + void init(d1::constraints& constraints) { + __TBB_ASSERT(tcm_request_permit, nullptr); + __TBB_ASSERT(tcm_deactivate_permit, nullptr); + + if (constraints.core_type != d1::task_arena::automatic || + constraints.numa_id != d1::task_arena::automatic || + constraints.max_threads_per_core != d1::task_arena::automatic) + { + my_permit_constraints.max_concurrency = constraints.max_concurrency; + my_permit_constraints.min_concurrency = 0; + my_permit_constraints.core_type_id = constraints.core_type; + my_permit_constraints.numa_id = constraints.numa_id; + my_permit_constraints.threads_per_core = constraints.max_threads_per_core; + + my_permit_request.cpu_constraints = &my_permit_constraints; + my_permit_request.constraints_size = 1; + } + + my_permit_request.min_sw_threads = 0; + my_permit_request.max_sw_threads = 0; + } + + void register_thread() override { + __TBB_ASSERT(tcm_register_thread, nullptr); + auto return_code = tcm_register_thread(my_permit_handle); + __TBB_ASSERT_EX(return_code == TCM_RESULT_SUCCESS, nullptr); + } + + void unregister_thread() override { + __TBB_ASSERT(tcm_unregister_thread, nullptr); + auto return_code = tcm_unregister_thread(); + __TBB_ASSERT_EX(return_code == TCM_RESULT_SUCCESS, nullptr); + } + +private: + tcm_cpu_constraints_t 
my_permit_constraints = TCM_PERMIT_REQUEST_CONSTRAINTS_INITIALIZER; + tcm_permit_request_t my_permit_request = TCM_PERMIT_REQUEST_INITIALIZER; + tcm_permit_handle_t my_permit_handle{}; + tcm_client_mutex_type my_permit_mutex; + tcm_adaptor& my_tcm_adaptor; +}; + +//------------------------------------------------------------------------ +// tcm_adaptor_impl +//------------------------------------------------------------------------ + +struct tcm_adaptor_impl { + using demand_mutex_type = d1::mutex; + demand_mutex_type my_demand_mutex; + tcm_client_id_t client_id{}; + + tcm_adaptor_impl(tcm_client_id_t id) : client_id(id) + {} +}; + +//------------------------------------------------------------------------ +// tcm_adaptor +//------------------------------------------------------------------------ + +tcm_result_t renegotiation_callback(tcm_permit_handle_t, void* client_ptr, tcm_callback_flags_t) { + __TBB_ASSERT(client_ptr, nullptr); + static_cast(client_ptr)->actualize_permit(); + return TCM_RESULT_SUCCESS; +} + +void tcm_adaptor::initialize() { + tcm_functions_loaded = dynamic_link(TCMLIB_NAME, tcm_link_table, /* tcm_link_table size = */ 11); +} + +bool tcm_adaptor::is_initialized() { + return tcm_functions_loaded; +} + +void tcm_adaptor::print_version() { + if (is_initialized()) { + __TBB_ASSERT(tcm_get_version_info, nullptr); + char buffer[1024]; + tcm_get_version_info(buffer, 1024); + std::fprintf(stderr, "%.*s", 1024, buffer); + } +} + +tcm_adaptor::tcm_adaptor() { + __TBB_ASSERT(tcm_connect, nullptr); + tcm_client_id_t client_id{}; + auto return_code = tcm_connect(renegotiation_callback, &client_id); + if (return_code == TCM_RESULT_SUCCESS) { + my_impl = make_cache_aligned_unique(client_id); + } +} + +tcm_adaptor::~tcm_adaptor() { + if (my_impl) { + __TBB_ASSERT(tcm_disconnect, nullptr); + auto return_code = tcm_disconnect(my_impl->client_id); + __TBB_ASSERT_EX(return_code == TCM_RESULT_SUCCESS, nullptr); + my_impl = nullptr; + } +} + +bool 
tcm_adaptor::is_connected() { + return my_impl != nullptr; +} + +pm_client* tcm_adaptor::create_client(arena& a) { + return new (cache_aligned_allocate(sizeof(tcm_client))) tcm_client(*this, a); +} + +void tcm_adaptor::register_client(pm_client* c, d1::constraints& constraints) { + static_cast(c)->init(constraints); +} + +void tcm_adaptor::unregister_and_destroy_client(pm_client& c) { + auto& client = static_cast(c); + + { + tcm_adaptor_impl::demand_mutex_type::scoped_lock lock(my_impl->my_demand_mutex); + client.~tcm_client(); + } + cache_aligned_deallocate(&client); +} + +void tcm_adaptor::set_active_num_workers(int) {} + + +void tcm_adaptor::adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) { + __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); + + auto& client = static_cast(c); + { + tcm_adaptor_impl::demand_mutex_type::scoped_lock lock(my_impl->my_demand_mutex); + + // Update client's state + workers_delta = client.update_request(mandatory_delta, workers_delta); + if (workers_delta == 0) return; + + if (client.max_workers() == 0) { + client.deactivate_permit(); + } else { + client.request_permit(my_impl->client_id); + } + } + + client.actualize_permit(); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third-party/tbb/src/tbb/tcm_adaptor.h b/third-party/tbb/src/tbb/tcm_adaptor.h new file mode 100644 index 00000000..f9f4d527 --- /dev/null +++ b/third-party/tbb/src/tbb/tcm_adaptor.h @@ -0,0 +1,63 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_tcm_adaptor_H +#define _TBB_tcm_adaptor_H + +#include "scheduler_common.h" + +#include "permit_manager.h" +#include "pm_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +struct tcm_adaptor_impl; + +//------------------------------------------------------------------------ +// Class tcm_adaptor +//------------------------------------------------------------------------ + +class tcm_adaptor : public permit_manager { +public: + tcm_adaptor(); + ~tcm_adaptor(); + + pm_client* create_client(arena& a) override; + void register_client(pm_client* client, d1::constraints& constraints) override; + void unregister_and_destroy_client(pm_client& c) override; + + void set_active_num_workers(int soft_limit) override; + + void adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) override; + + bool is_connected(); + + static void initialize(); + static bool is_initialized(); + static void print_version(); +private: + cache_aligned_unique_ptr my_impl; + + friend class tcm_client; +}; // class tcm_adaptor + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_tcm_adaptor_H */ diff --git a/third-party/tbb/src/tbb/thread_control_monitor.h b/third-party/tbb/src/tbb/thread_control_monitor.h new file mode 100644 index 00000000..f9c3cacc --- /dev/null +++ b/third-party/tbb/src/tbb/thread_control_monitor.h @@ -0,0 +1,116 @@ +/* + Copyright (c) 2021-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_thread_control_monitor_H +#define __TBB_thread_control_monitor_H + +#include "concurrent_monitor.h" +#include "scheduler_common.h" + +#include + +namespace tbb { +namespace detail { +namespace r1 { + +struct market_context { + market_context() = default; + + market_context(std::uintptr_t first_addr, arena* a) : + my_uniq_addr(first_addr), my_arena_addr(a) + {} + + std::uintptr_t my_uniq_addr{0}; + arena* my_arena_addr{nullptr}; +}; + +#if __TBB_RESUMABLE_TASKS +class resume_node : public wait_node { + using base_type = wait_node; +public: + resume_node(market_context ctx, execution_data_ext& ed_ext, task_dispatcher& target) + : base_type(ctx), my_curr_dispatcher(ed_ext.task_disp), my_target_dispatcher(&target) + , my_suspend_point(my_curr_dispatcher->get_suspend_point()) + {} + + ~resume_node() override { + if (this->my_skipped_wakeup) { + spin_wait_until_eq(this->my_notify_calls, 1); + } + + poison_pointer(my_curr_dispatcher); + poison_pointer(my_target_dispatcher); + poison_pointer(my_suspend_point); + } + + void init() override { + base_type::init(); + } + + void wait() override { + my_curr_dispatcher->resume(*my_target_dispatcher); + __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?"); + } + + void reset() override { + base_type::reset(); + spin_wait_until_eq(this->my_notify_calls, 1); + my_notify_calls.store(0, std::memory_order_relaxed); + } + + // notify is called (perhaps, concurrently) twice from: + // - concurrent_monitor::notify + // - post_resume_action::register_waiter + // The second notify is called after thread switches the stack + // (Because we can not call resume while the stack is occupied) + // We need calling resume only when both notifications are performed. 
+ void notify() override { + if (++my_notify_calls == 2) { + r1::resume(my_suspend_point); + } + } + +private: + friend class thread_data; + friend struct suspend_point_type::resume_task; + task_dispatcher* my_curr_dispatcher; + task_dispatcher* my_target_dispatcher; + suspend_point_type* my_suspend_point; + std::atomic my_notify_calls{0}; +}; +#endif // __TBB_RESUMABLE_TASKS + +class thread_control_monitor : public concurrent_monitor_base { + using base_type = concurrent_monitor_base; +public: + using base_type::base_type; + + ~thread_control_monitor() { + destroy(); + } + + /** per-thread descriptor for concurrent_monitor */ + using thread_context = sleep_node; +#if __TBB_RESUMABLE_TASKS + using resume_context = resume_node; +#endif +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_thread_control_monitor_H diff --git a/third-party/tbb/src/tbb/thread_data.h b/third-party/tbb/src/tbb/thread_data.h index 808f3cc3..9dfa492a 100644 --- a/third-party/tbb/src/tbb/thread_data.h +++ b/third-party/tbb/src/tbb/thread_data.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2022 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -28,6 +28,7 @@ #include "mailbox.h" #include "misc.h" // FastRandom #include "small_object_pool_impl.h" +#include "intrusive_list.h" #include @@ -39,8 +40,9 @@ class task; class arena_slot; class task_group_context; class task_dispatcher; +class thread_dispatcher_client; -class context_list : public intrusive_list { +class context_list : public intrusive_list { public: bool orphaned{false}; @@ -61,10 +63,10 @@ class context_list : public intrusive_list { cache_aligned_deallocate(this); } - void remove(intrusive_list_node& val) { + void remove(d1::intrusive_list_node& val) { mutex::scoped_lock lock(m_mutex); - intrusive_list::remove(val); + intrusive_list::remove(val); if (orphaned && empty()) { lock.release(); @@ -72,10 +74,10 @@ class context_list : public intrusive_list { } } - void push_front(intrusive_list_node& val) { + void push_front(d1::intrusive_list_node& val) { mutex::scoped_lock lock(m_mutex); - intrusive_list::push_front(val); + intrusive_list::push_front(val); } void orphan() { @@ -93,14 +95,15 @@ class context_list : public intrusive_list { // Thread Data //------------------------------------------------------------------------ class thread_data : public ::rml::job - , public intrusive_list_node + , public d1::intrusive_list_node , no_copy { public: thread_data(unsigned short index, bool is_worker) : my_arena_index{ index } , my_is_worker{ is_worker } , my_task_dispatcher{ nullptr } - , my_arena{} + , my_arena{ nullptr } + , my_last_client{ nullptr } , my_arena_slot{} , my_random{ this } , my_last_observer{ nullptr } @@ -134,8 +137,7 @@ class thread_data : public ::rml::job void detach_task_dispatcher(); void enter_task_dispatcher(task_dispatcher& task_disp, std::uintptr_t stealing_threshold); void leave_task_dispatcher(); - template - void propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state); + void propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, 
d1::task_group_context& src, uint32_t new_state); //! Index of the arena slot the scheduler occupies now, or occupied last time unsigned short my_arena_index; @@ -149,6 +151,8 @@ class thread_data : public ::rml::job //! The arena that I own (if external thread) or am servicing at the moment (if worker) arena* my_arena; + thread_dispatcher_client* my_last_client; + //! Pointer to the slot in the arena we own at the moment arena_slot* my_arena_slot; @@ -232,6 +236,21 @@ inline void thread_data::leave_task_dispatcher() { detach_task_dispatcher(); } +inline void thread_data::propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) { + mutex::scoped_lock lock(my_context_list->m_mutex); + // Acquire fence is necessary to ensure that the subsequent node->my_next load + // returned the correct value in case it was just inserted in another thread. + // The fence also ensures visibility of the correct ctx.my_parent value. + for (context_list::iterator it = my_context_list->begin(); it != my_context_list->end(); ++it) { + d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, &(*it)); + if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state) + task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state); + } + // Sync up local propagation epoch with the global one. Release fence prevents + // reordering of possible store to *mptr_state after the sync point. 
+ my_context_list->epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release); +} + } // namespace r1 } // namespace detail } // namespace tbb diff --git a/third-party/tbb/src/tbb/thread_dispatcher.cpp b/third-party/tbb/src/tbb/thread_dispatcher.cpp new file mode 100644 index 00000000..69a108d6 --- /dev/null +++ b/third-party/tbb/src/tbb/thread_dispatcher.cpp @@ -0,0 +1,236 @@ +/* + Copyright (c) 2022-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "thread_dispatcher.h" +#include "threading_control.h" + +namespace tbb { +namespace detail { +namespace r1 { + +thread_dispatcher::thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size) + : my_threading_control(tc) + , my_num_workers_hard_limit(hard_limit) + , my_stack_size(stack_size) +{ + my_server = governor::create_rml_server( *this ); + __TBB_ASSERT( my_server, "Failed to create RML server" ); +} + +thread_dispatcher::~thread_dispatcher() { + poison_pointer(my_server); +} + +thread_dispatcher_client* thread_dispatcher::select_next_client(thread_dispatcher_client* hint) { + unsigned next_client_priority_level = num_priority_levels; + if (hint) { + next_client_priority_level = hint->priority_level(); + } + + for (unsigned idx = 0; idx < next_client_priority_level; ++idx) { + if (!my_client_list[idx].empty()) { + return &*my_client_list[idx].begin(); + } + } + + return hint; +} + +thread_dispatcher_client* thread_dispatcher::create_client(arena& a) { + return new (cache_aligned_allocate(sizeof(thread_dispatcher_client))) thread_dispatcher_client(a, my_clients_aba_epoch); +} + + +void thread_dispatcher::register_client(thread_dispatcher_client* client) { + client_list_mutex_type::scoped_lock lock(my_list_mutex); + insert_client(*client); +} + +bool thread_dispatcher::try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority) { + __TBB_ASSERT(client, nullptr); + // we hold reference to the server, so market cannot be destroyed at any moment here + __TBB_ASSERT(!is_poisoned(my_server), nullptr); + my_list_mutex.lock(); + for (auto& it : my_client_list[priority]) { + if (client == &it) { + if (it.get_aba_epoch() == aba_epoch) { + // Client is alive + // Acquire my_references to sync with threads that just left the arena + // Pay attention that references should be read before workers_requested because + // if references is no zero some other thread might call adjust_demand and 
lead to + // a race over workers_requested + if (!client->references() && !client->has_request()) { + // Client is abandoned. Destroy it. + remove_client(*client); + ++my_clients_aba_epoch; + + my_list_mutex.unlock(); + destroy_client(client); + + return true; + } + } + break; + } + } + my_list_mutex.unlock(); + return false; +} + +void thread_dispatcher::destroy_client(thread_dispatcher_client* client) { + client->~thread_dispatcher_client(); + cache_aligned_deallocate(client); +} + +// Should be called under lock +void thread_dispatcher::insert_client(thread_dispatcher_client& client) { + __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); + my_client_list[client.priority_level()].push_front(client); + + __TBB_ASSERT(!my_next_client || my_next_client->priority_level() < num_priority_levels, nullptr); + my_next_client = select_next_client(my_next_client); +} + +// Should be called under lock +void thread_dispatcher::remove_client(thread_dispatcher_client& client) { + __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); + my_client_list[client.priority_level()].remove(client); + + if (my_next_client == &client) { + my_next_client = nullptr; + } + my_next_client = select_next_client(my_next_client); +} + +bool thread_dispatcher::is_client_alive(thread_dispatcher_client* client) { + if (!client) { + return false; + } + + // Still cannot access internals of the client since the object itself might be destroyed. + for (auto& priority_list : my_client_list) { + for (auto& c : priority_list) { + if (client == &c) { + return true; + } + } + } + return false; +} + +thread_dispatcher_client* thread_dispatcher::client_in_need(client_list_type* clients, thread_dispatcher_client* hint) { + // TODO: make sure client with higher priority returned only if there are available slots in it. 
+ hint = select_next_client(hint); + if (!hint) { + return nullptr; + } + + client_list_type::iterator it = hint; + unsigned curr_priority_level = hint->priority_level(); + __TBB_ASSERT(it != clients[curr_priority_level].end(), nullptr); + do { + thread_dispatcher_client& t = *it; + if (++it == clients[curr_priority_level].end()) { + do { + ++curr_priority_level %= num_priority_levels; + } while (clients[curr_priority_level].empty()); + it = clients[curr_priority_level].begin(); + } + if (t.try_join()) { + return &t; + } + } while (it != hint); + return nullptr; +} + +thread_dispatcher_client* thread_dispatcher::client_in_need(thread_dispatcher_client* prev) { + client_list_mutex_type::scoped_lock lock(my_list_mutex, /*is_writer=*/false); + if (is_client_alive(prev)) { + return client_in_need(my_client_list, prev); + } + return client_in_need(my_client_list, my_next_client); +} + +bool thread_dispatcher::is_any_client_in_need() { + client_list_mutex_type::scoped_lock lock(my_list_mutex, /*is_writer=*/false); + for (auto& priority_list : my_client_list) { + for (auto& client : priority_list) { + if (client.is_joinable()) { + return true; + } + } + } + return false; +} + +void thread_dispatcher::adjust_job_count_estimate(int delta) { + my_server->adjust_job_count_estimate(delta); +} + +void thread_dispatcher::release(bool blocking_terminate) { + my_join_workers = blocking_terminate; + my_server->request_close_connection(); +} + +void thread_dispatcher::process(job& j) { + thread_data& td = static_cast(j); + // td.my_last_client can be dead. Don't access it until client_in_need is called + thread_dispatcher_client* client = td.my_last_client; + for (int i = 0; i < 2; ++i) { + while ((client = client_in_need(client)) ) { + td.my_last_client = client; + client->process(td); + } + // Workers leave thread_dispatcher because there is no client in need. It can happen earlier than + // adjust_job_count_estimate() decreases my_slack and RML can put this thread to sleep. 
+ // It might result in a busy-loop checking for my_slack<0 and calling this method instantly. + // the yield refines this spinning. + if ( !i ) { + yield(); + } + } +} + + +//! Used when RML asks for join mode during workers termination. +bool thread_dispatcher::must_join_workers() const { return my_join_workers; } + +//! Returns the requested stack size of worker threads. +std::size_t thread_dispatcher::worker_stack_size() const { return my_stack_size; } + +void thread_dispatcher::acknowledge_close_connection() { + my_threading_control.destroy(); +} + +::rml::job* thread_dispatcher::create_one_job() { + unsigned short index = ++my_first_unused_worker_idx; + __TBB_ASSERT(index > 0, nullptr); + ITT_THREAD_SET_NAME(_T("TBB Worker Thread")); + // index serves as a hint decreasing conflicts between workers when they migrate between arenas + thread_data* td = new (cache_aligned_allocate(sizeof(thread_data))) thread_data{ index, true }; + __TBB_ASSERT(index <= my_num_workers_hard_limit, nullptr); + my_threading_control.register_thread(*td); + return td; +} + +void thread_dispatcher::cleanup(job& j) { + my_threading_control.unregister_thread(static_cast(j)); + governor::auto_terminate(&j); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third-party/tbb/src/tbb/thread_dispatcher.h b/third-party/tbb/src/tbb/thread_dispatcher.h new file mode 100644 index 00000000..e511e2b7 --- /dev/null +++ b/third-party/tbb/src/tbb/thread_dispatcher.h @@ -0,0 +1,107 @@ +/* + Copyright (c) 2022-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_thread_dispatcher_H +#define _TBB_thread_dispatcher_H + +#include "oneapi/tbb/detail/_config.h" +#include "oneapi/tbb/detail/_utils.h" +#include "oneapi/tbb/rw_mutex.h" +#include "oneapi/tbb/task_arena.h" + +#include "arena.h" +#include "governor.h" +#include "thread_data.h" +#include "rml_tbb.h" +#include "thread_dispatcher_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class threading_control_impl; + +class thread_dispatcher : no_copy, rml::tbb_client { + using client_list_type = intrusive_list; + using client_list_mutex_type = d1::rw_mutex; +public: + thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size); + ~thread_dispatcher(); + + thread_dispatcher_client* create_client(arena& a); + void register_client(thread_dispatcher_client* client); + bool try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority); + bool is_any_client_in_need(); + + void adjust_job_count_estimate(int delta); + void release(bool blocking_terminate); + void process(job& j) override; + //! Used when RML asks for join mode during workers termination. + bool must_join_workers() const; + //! Returns the requested stack size of worker threads. 
+ std::size_t worker_stack_size() const; + +private: + version_type version () const override { return 0; } + unsigned max_job_count () const override { return my_num_workers_hard_limit; } + std::size_t min_stack_size () const override { return worker_stack_size(); } + void cleanup(job& j) override; + void acknowledge_close_connection() override; + ::rml::job* create_one_job() override; + + thread_dispatcher_client* select_next_client(thread_dispatcher_client* hint); + void destroy_client(thread_dispatcher_client* client); + void insert_client(thread_dispatcher_client& client); + void remove_client(thread_dispatcher_client& client); + bool is_client_alive(thread_dispatcher_client* client); + thread_dispatcher_client* client_in_need(client_list_type* clients, thread_dispatcher_client* hint); + thread_dispatcher_client* client_in_need(thread_dispatcher_client* prev); + + friend class threading_control_impl; + static constexpr unsigned num_priority_levels = d1::num_priority_levels; + client_list_mutex_type my_list_mutex; + client_list_type my_client_list[num_priority_levels]; + + thread_dispatcher_client* my_next_client{nullptr}; + + //! Shutdown mode + bool my_join_workers{false}; + + threading_control& my_threading_control; + + //! ABA prevention marker to assign to newly created clients + std::atomic my_clients_aba_epoch{0}; + + //! Maximal number of workers allowed for use by the underlying resource manager + /** It can't be changed after thread_dispatcher creation. **/ + unsigned my_num_workers_hard_limit{0}; + + //! Stack size of worker threads + std::size_t my_stack_size{0}; + + //! First unused index of worker + /** Used to assign indices to the new workers coming from RML **/ + std::atomic my_first_unused_worker_idx{0}; + + //! Pointer to the RML server object that services this TBB instance. 
+ rml::tbb_server* my_server{nullptr}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_dispatcher_H diff --git a/third-party/tbb/src/tbb/thread_dispatcher_client.h b/third-party/tbb/src/tbb/thread_dispatcher_client.h new file mode 100644 index 00000000..f7c199cb --- /dev/null +++ b/third-party/tbb/src/tbb/thread_dispatcher_client.h @@ -0,0 +1,69 @@ +/* + Copyright (c) 2022-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_thread_dispatcher_client_H +#define _TBB_thread_dispatcher_client_H + +#include "oneapi/tbb/detail/_intrusive_list_node.h" +#include "arena.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_dispatcher_client : public d1::intrusive_list_node /* Need for list in thread pool */ { +public: + thread_dispatcher_client(arena& a, std::uint64_t aba_epoch) : my_arena(a), my_aba_epoch(aba_epoch) {} + + // Interface of communication with thread pool + bool try_join() { + return my_arena.try_join(); + } + + bool is_joinable() { + return my_arena.is_joinable(); + } + + void process(thread_data& td) { + my_arena.process(td); + } + + unsigned priority_level() { + return my_arena.priority_level(); + } + + std::uint64_t get_aba_epoch() { + return my_aba_epoch; + } + + unsigned references() { + return my_arena.references(); + } + + bool has_request() { + return my_arena.has_request(); + } + +private: + arena& my_arena; + std::uint64_t my_aba_epoch; +}; + +} // namespace r1 
+} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_dispatcher_client_H diff --git a/third-party/tbb/src/tbb/thread_request_serializer.cpp b/third-party/tbb/src/tbb/thread_request_serializer.cpp new file mode 100644 index 00000000..6019f732 --- /dev/null +++ b/third-party/tbb/src/tbb/thread_request_serializer.cpp @@ -0,0 +1,140 @@ +/* + Copyright (c) 2022-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "misc.h" +#include "thread_request_serializer.h" + +namespace tbb { +namespace detail { +namespace r1 { + +thread_request_serializer::thread_request_serializer(thread_dispatcher& td, int soft_limit) + : my_thread_dispatcher(td) + , my_soft_limit(soft_limit) +{} + +void thread_request_serializer::update(int delta) { + constexpr std::uint64_t delta_mask = (pending_delta_base << 1) - 1; + constexpr std::uint64_t counter_value = delta_mask + 1; + + int prev_pending_delta = my_pending_delta.fetch_add(counter_value + delta); + + // There is a pseudo request aggregator, so only thread that see pending_delta_base in my_pending_delta + // Will enter to critical section and call adjust_job_count_estimate + if (prev_pending_delta == pending_delta_base) { + delta = int(my_pending_delta.exchange(pending_delta_base) & delta_mask) - int(pending_delta_base); + mutex_type::scoped_lock lock(my_mutex); + my_total_request.store(my_total_request.load(std::memory_order_relaxed) + delta, std::memory_order_relaxed); + delta = limit_delta(delta, 
my_soft_limit, my_total_request.load(std::memory_order_relaxed)); + my_thread_dispatcher.adjust_job_count_estimate(delta); + } +} + +void thread_request_serializer::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex); + int delta = soft_limit - my_soft_limit; + delta = limit_delta(delta, my_total_request.load(std::memory_order_relaxed), soft_limit); + my_thread_dispatcher.adjust_job_count_estimate(delta); + my_soft_limit = soft_limit; +} + +int thread_request_serializer::limit_delta(int delta, int limit, int new_value) { + // This method can be described with such pseudocode: + // bool above_limit = prev_value >= limit && new_value >= limit; + // bool below_limit = prev_value <= limit && new_value <= limit; + // enum request_type { ABOVE_LIMIT, CROSS_LIMIT, BELOW_LIMIT }; + // request = above_limit ? ABOVE_LIMIT : below_limit ? BELOW_LIMIT : CROSS_LIMIT; + + // switch (request) { + // case ABOVE_LIMIT: + // delta = 0; + // case CROSS_LIMIT: + // delta = delta > 0 ? 
limit - prev_value : new_value - limit; + // case BELOW_LIMIT: + // // No changes to delta + // } + + int prev_value = new_value - delta; + + // actual new_value and prev_value cannot exceed the limit + new_value = min(limit, new_value); + prev_value = min(limit, prev_value); + return new_value - prev_value; +} + + +thread_request_serializer_proxy::thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit) : my_serializer(td, soft_limit) +{} + +void thread_request_serializer_proxy::register_mandatory_request(int mandatory_delta) { + if (mandatory_delta != 0) { + mutex_type::scoped_lock lock(my_mutex, /* is_write = */ false); + int prev_value = my_num_mandatory_requests.fetch_add(mandatory_delta); + + const bool should_try_enable = mandatory_delta > 0 && prev_value == 0; + const bool should_try_disable = mandatory_delta < 0 && prev_value == 1; + + if (should_try_enable) { + enable_mandatory_concurrency(lock); + } else if (should_try_disable) { + disable_mandatory_concurrency(lock); + } + } +} + +void thread_request_serializer_proxy::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex, /* is_write = */ true); + + if (soft_limit != 0) { + my_is_mandatory_concurrency_enabled = false; + my_serializer.set_active_num_workers(soft_limit); + } else { + if (my_num_mandatory_requests > 0 && !my_is_mandatory_concurrency_enabled) { + my_is_mandatory_concurrency_enabled = true; + my_serializer.set_active_num_workers(1); + } + } +} + +int thread_request_serializer_proxy::num_workers_requested() { return my_serializer.num_workers_requested(); } + +void thread_request_serializer_proxy::update(int delta) { my_serializer.update(delta); } + +void thread_request_serializer_proxy::enable_mandatory_concurrency(mutex_type::scoped_lock& lock) { + lock.upgrade_to_writer(); + bool still_should_enable = my_num_mandatory_requests.load(std::memory_order_relaxed) > 0 && + !my_is_mandatory_concurrency_enabled && my_serializer.is_no_workers_avaliable(); + + 
if (still_should_enable) { + my_is_mandatory_concurrency_enabled = true; + my_serializer.set_active_num_workers(1); + } +} + +void thread_request_serializer_proxy::disable_mandatory_concurrency(mutex_type::scoped_lock& lock) { + lock.upgrade_to_writer(); + bool still_should_disable = my_num_mandatory_requests.load(std::memory_order_relaxed) <= 0 && + my_is_mandatory_concurrency_enabled && !my_serializer.is_no_workers_avaliable(); + + if (still_should_disable) { + my_is_mandatory_concurrency_enabled = false; + my_serializer.set_active_num_workers(0); + } +} + +} // r1 +} // detail +} // tbb diff --git a/third-party/tbb/src/tbb/thread_request_serializer.h b/third-party/tbb/src/tbb/thread_request_serializer.h new file mode 100644 index 00000000..9dc9799e --- /dev/null +++ b/third-party/tbb/src/tbb/thread_request_serializer.h @@ -0,0 +1,84 @@ +/* + Copyright (c) 2022-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_thread_serializer_handlers_H +#define _TBB_thread_serializer_handlers_H + +#include "oneapi/tbb/mutex.h" +#include "oneapi/tbb/rw_mutex.h" + +#include "thread_dispatcher.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_request_observer { +protected: + virtual ~thread_request_observer() {} +public: + virtual void update(int delta) = 0; +}; + + +class thread_request_serializer : public thread_request_observer { + using mutex_type = d1::mutex; +public: + thread_request_serializer(thread_dispatcher& td, int soft_limit); + void set_active_num_workers(int soft_limit); + int num_workers_requested() { return my_total_request.load(std::memory_order_relaxed); } + bool is_no_workers_avaliable() { return my_soft_limit == 0; } + +private: + friend class thread_request_serializer_proxy; + void update(int delta) override; + static int limit_delta(int delta, int limit, int new_value); + + thread_dispatcher& my_thread_dispatcher; + int my_soft_limit{ 0 }; + std::atomic my_total_request{ 0 }; + // my_pending_delta is set to pending_delta_base to have ability to hold negative values + // consider increase base since thead number will be bigger than 1 << 15 + static constexpr std::uint64_t pending_delta_base = 1 << 15; + std::atomic my_pending_delta{ pending_delta_base }; + mutex_type my_mutex; +}; + +// Handles mandatory concurrency i.e. 
enables worker threads for enqueue tasks +class thread_request_serializer_proxy : public thread_request_observer { + using mutex_type = d1::rw_mutex; +public: + thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit); + void register_mandatory_request(int mandatory_delta); + void set_active_num_workers(int soft_limit); + int num_workers_requested(); + +private: + void update(int delta) override; + void enable_mandatory_concurrency(mutex_type::scoped_lock& lock); + void disable_mandatory_concurrency(mutex_type::scoped_lock& lock); + + std::atomic my_num_mandatory_requests{0}; + bool my_is_mandatory_concurrency_enabled{false}; + thread_request_serializer my_serializer; + mutex_type my_mutex; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_serializer_handlers_H diff --git a/third-party/tbb/src/tbb/threading_control.cpp b/third-party/tbb/src/tbb/threading_control.cpp new file mode 100644 index 00000000..1ca18378 --- /dev/null +++ b/third-party/tbb/src/tbb/threading_control.cpp @@ -0,0 +1,406 @@ +/* + Copyright (c) 2022-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "threading_control.h" +#include "permit_manager.h" +#include "market.h" +#include "tcm_adaptor.h" +#include "thread_dispatcher.h" +#include "governor.h" +#include "thread_dispatcher_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +// ---------------------------------------- threading_control_impl -------------------------------------------------------------- + +std::size_t global_control_active_value_unsafe(d1::global_control::parameter); + +std::pair threading_control_impl::calculate_workers_limits() { + // Expecting that 4P is suitable for most applications. + // Limit to 2P for large thread number. + // TODO: ask RML for max concurrency and possibly correct hard_limit + unsigned factor = governor::default_num_threads() <= 128 ? 4 : 2; + + // The requested number of threads is intentionally not considered in + // computation of the hard limit, in order to separate responsibilities + // and avoid complicated interactions between global_control and task_scheduler_init. + // The threading control guarantees that at least 256 threads might be created. + unsigned workers_app_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism); + unsigned workers_hard_limit = max(max(factor * governor::default_num_threads(), 256u), workers_app_limit); + unsigned workers_soft_limit = calc_workers_soft_limit(workers_hard_limit); + + return std::make_pair(workers_soft_limit, workers_hard_limit); +} + +unsigned threading_control_impl::calc_workers_soft_limit(unsigned workers_hard_limit) { + unsigned workers_soft_limit{}; + unsigned soft_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism); + + // if user set no limits (yet), use default value + workers_soft_limit = soft_limit != 0 ? 
soft_limit - 1 : governor::default_num_threads() - 1; + + if (workers_soft_limit >= workers_hard_limit) { + workers_soft_limit = workers_hard_limit - 1; + } + + return workers_soft_limit; +} + +cache_aligned_unique_ptr threading_control_impl::make_permit_manager(unsigned workers_soft_limit) { + if (tcm_adaptor::is_initialized()) { + auto tcm = make_cache_aligned_unique(); + if (tcm->is_connected()) { + return tcm; + } + } + return make_cache_aligned_unique(workers_soft_limit); +} + +cache_aligned_unique_ptr threading_control_impl::make_thread_dispatcher(threading_control& tc, + unsigned workers_soft_limit, + unsigned workers_hard_limit) +{ + stack_size_type stack_size = global_control_active_value_unsafe(global_control::thread_stack_size); + + cache_aligned_unique_ptr td = + make_cache_aligned_unique(tc, workers_hard_limit, stack_size); + // This check relies on the fact that for shared RML default_concurrency == max_concurrency + if (!governor::UsePrivateRML && td->my_server->default_concurrency() < workers_soft_limit) { + runtime_warning("RML might limit the number of workers to %u while %u is requested.\n", + td->my_server->default_concurrency(), workers_soft_limit); + } + + return td; +} + +threading_control_impl::threading_control_impl(threading_control* tc) { + unsigned workers_soft_limit{}, workers_hard_limit{}; + std::tie(workers_soft_limit, workers_hard_limit) = calculate_workers_limits(); + + my_permit_manager = make_permit_manager(workers_soft_limit); + my_thread_dispatcher = make_thread_dispatcher(*tc, workers_soft_limit, workers_hard_limit); + my_thread_request_serializer = + make_cache_aligned_unique(*my_thread_dispatcher, workers_soft_limit); + my_permit_manager->set_thread_request_observer(*my_thread_request_serializer); + + my_cancellation_disseminator = make_cache_aligned_unique(); + my_waiting_threads_monitor = make_cache_aligned_unique(); +} + +void threading_control_impl::release(bool blocking_terminate) { + 
my_thread_dispatcher->release(blocking_terminate); +} + +void threading_control_impl::set_active_num_workers(unsigned soft_limit) { + __TBB_ASSERT(soft_limit <= my_thread_dispatcher->my_num_workers_hard_limit, nullptr); + my_thread_request_serializer->set_active_num_workers(soft_limit); + my_permit_manager->set_active_num_workers(soft_limit); +} + +threading_control_client threading_control_impl::create_client(arena& a) { + pm_client* pm_client = my_permit_manager->create_client(a); + thread_dispatcher_client* td_client = my_thread_dispatcher->create_client(a); + + return threading_control_client{pm_client, td_client}; +} + +threading_control_impl::client_snapshot threading_control_impl::prepare_client_destruction(threading_control_client client) { + auto td_client = client.get_thread_dispatcher_client(); + return {td_client->get_aba_epoch(), td_client->priority_level(), td_client, client.get_pm_client()}; +} + +bool threading_control_impl::try_destroy_client(threading_control_impl::client_snapshot snapshot) { + if (my_thread_dispatcher->try_unregister_client(snapshot.my_td_client, snapshot.aba_epoch, snapshot.priority_level)) { + my_permit_manager->unregister_and_destroy_client(*snapshot.my_pm_client); + return true; + } + return false; +} + +void threading_control_impl::publish_client(threading_control_client tc_client, d1::constraints& constraints) { + my_permit_manager->register_client(tc_client.get_pm_client(), constraints); + my_thread_dispatcher->register_client(tc_client.get_thread_dispatcher_client()); +} + +void threading_control_impl::register_thread(thread_data& td) { + my_cancellation_disseminator->register_thread(td); +} +void threading_control_impl::unregister_thread(thread_data& td) { + my_cancellation_disseminator->unregister_thread(td); +} + +void threading_control_impl::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state) +{ + 
my_cancellation_disseminator->propagate_task_group_state(mptr_state, src, new_state); +} + +std::size_t threading_control_impl::worker_stack_size() { + return my_thread_dispatcher->worker_stack_size(); +} + +unsigned threading_control_impl::max_num_workers() { + return my_thread_dispatcher->my_num_workers_hard_limit; +} + +void threading_control_impl::adjust_demand(threading_control_client tc_client, int mandatory_delta, int workers_delta) { + auto& c = *tc_client.get_pm_client(); + my_thread_request_serializer->register_mandatory_request(mandatory_delta); + my_permit_manager->adjust_demand(c, mandatory_delta, workers_delta); +} + +bool threading_control_impl::is_any_other_client_active() { + return my_thread_request_serializer->num_workers_requested() > 0 ? my_thread_dispatcher->is_any_client_in_need() : false; +} + +thread_control_monitor& threading_control_impl::get_waiting_threads_monitor() { + return *my_waiting_threads_monitor; +} + +// ---------------------------------------- threading_control ------------------------------------------------------------------- + +// Defined in global_control.cpp +void global_control_lock(); +void global_control_unlock(); + +void threading_control::add_ref(bool is_public) { + ++my_ref_count; + if (is_public) { + my_public_ref_count++; + } +} + +bool threading_control::remove_ref(bool is_public) { + if (is_public) { + __TBB_ASSERT(g_threading_control == this, "Global threading control instance was destroyed prematurely?"); + __TBB_ASSERT(my_public_ref_count.load(std::memory_order_relaxed), nullptr); + --my_public_ref_count; + } + + bool is_last_ref = --my_ref_count == 0; + if (is_last_ref) { + __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), nullptr); + g_threading_control = nullptr; + } + + return is_last_ref; +} + +threading_control* threading_control::get_threading_control(bool is_public) { + threading_control* control = g_threading_control; + if (control) { + control->add_ref(is_public); + } + + return 
control; +} + +threading_control* threading_control::create_threading_control() { + // Global control should be locked before threading_control_impl + global_control_lock(); + + threading_control* thr_control{ nullptr }; + try_call([&] { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + + thr_control = get_threading_control(/*public = */ true); + if (thr_control == nullptr) { + thr_control = new (cache_aligned_allocate(sizeof(threading_control))) threading_control(/*public_ref = */ 1, /*private_ref = */ 1); + thr_control->my_pimpl = make_cache_aligned_unique(thr_control); + + __TBB_InitOnce::add_ref(); + + if (global_control_active_value_unsafe(global_control::scheduler_handle)) { + ++thr_control->my_public_ref_count; + ++thr_control->my_ref_count; + } + + g_threading_control = thr_control; + } + }).on_exception([&] { + global_control_unlock(); + + cache_aligned_deleter deleter{}; + deleter(thr_control); + }); + + global_control_unlock(); + return thr_control; +} + +void threading_control::destroy () { + cache_aligned_deleter deleter; + deleter(this); + __TBB_InitOnce::remove_ref(); +} + +void threading_control::wait_last_reference(global_mutex_type::scoped_lock& lock) { + while (my_public_ref_count.load(std::memory_order_relaxed) == 1 && my_ref_count.load(std::memory_order_relaxed) > 1) { + lock.release(); + // To guarantee that request_close_connection() is called by the last external thread, we need to wait till all + // references are released. Re-read my_public_ref_count to limit waiting if new external threads are created. + // Theoretically, new private references to the threading control can be added during waiting making it potentially + // endless. + // TODO: revise why the weak scheduler needs threading control's pointer and try to remove this wait. + // Note that the threading control should know about its schedulers for cancellation/exception/priority propagation, + // see e.g. 
task_group_context::cancel_group_execution() + while (my_public_ref_count.load(std::memory_order_acquire) == 1 && my_ref_count.load(std::memory_order_acquire) > 1) { + yield(); + } + lock.acquire(g_threading_control_mutex); + } +} + +bool threading_control::release(bool is_public, bool blocking_terminate) { + bool do_release = false; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + if (blocking_terminate) { + __TBB_ASSERT(is_public, "Only an object with a public reference can request the blocking terminate"); + wait_last_reference(lock); + } + do_release = remove_ref(is_public); + } + + if (do_release) { + __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), "No public references must remain if we remove the threading control."); + // inform RML that blocking termination is required + my_pimpl->release(blocking_terminate); + return blocking_terminate; + } + return false; +} + +threading_control::threading_control(unsigned public_ref, unsigned ref) : my_public_ref_count(public_ref), my_ref_count(ref) +{} + +threading_control* threading_control::register_public_reference() { + threading_control* control{nullptr}; + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + control = get_threading_control(/*public = */ true); + if (!control) { + // We are going to create threading_control_impl, we should acquire mutexes in right order + lock.release(); + control = create_threading_control(); + } + + return control; +} + +bool threading_control::unregister_public_reference(bool blocking_terminate) { + __TBB_ASSERT(g_threading_control, "Threading control should exist until last public reference"); + __TBB_ASSERT(g_threading_control->my_public_ref_count.load(std::memory_order_relaxed), nullptr); + return g_threading_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate); +} + +threading_control_client threading_control::create_client(arena& a) { + { + global_mutex_type::scoped_lock 
lock(g_threading_control_mutex); + add_ref(/*public = */ false); + } + + return my_pimpl->create_client(a); +} + +void threading_control::publish_client(threading_control_client client, d1::constraints& constraints) { + return my_pimpl->publish_client(client, constraints); +} + +threading_control::client_snapshot threading_control::prepare_client_destruction(threading_control_client client) { + return my_pimpl->prepare_client_destruction(client); +} + +bool threading_control::try_destroy_client(threading_control::client_snapshot deleter) { + bool res = my_pimpl->try_destroy_client(deleter); + if (res) { + release(/*public = */ false, /*blocking_terminate = */ false); + } + return res; +} + +void threading_control::set_active_num_workers(unsigned soft_limit) { + threading_control* thr_control = get_threading_control(/*public = */ false); + if (thr_control != nullptr) { + thr_control->my_pimpl->set_active_num_workers(soft_limit); + thr_control->release(/*is_public=*/false, /*blocking_terminate=*/false); + } +} + +bool threading_control::is_present() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return g_threading_control != nullptr; +} + +bool threading_control::register_lifetime_control() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return get_threading_control(/*public = */ true) != nullptr; +} + +bool threading_control::unregister_lifetime_control(bool blocking_terminate) { + threading_control* thr_control{nullptr}; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + thr_control = g_threading_control; + } + + bool released{true}; + if (thr_control) { + released = thr_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate); + } + + return released; +} + +void threading_control::register_thread(thread_data& td) { + my_pimpl->register_thread(td); +} + +void threading_control::unregister_thread(thread_data& td) { + my_pimpl->unregister_thread(td); +} + +void 
threading_control::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state) +{ + my_pimpl->propagate_task_group_state(mptr_state, src, new_state); +} + +std::size_t threading_control::worker_stack_size() { + return my_pimpl->worker_stack_size(); +} + +unsigned threading_control::max_num_workers() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return g_threading_control ? g_threading_control->my_pimpl->max_num_workers() : 0; +} + +void threading_control::adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta) { + my_pimpl->adjust_demand(client, mandatory_delta, workers_delta); +} + +bool threading_control::is_any_other_client_active() { + return my_pimpl->is_any_other_client_active(); +} + +thread_control_monitor& threading_control::get_waiting_threads_monitor() { + return my_pimpl->get_waiting_threads_monitor(); +} + +} // r1 +} // detail +} // tbb diff --git a/third-party/tbb/src/tbb/threading_control.h b/third-party/tbb/src/tbb/threading_control.h new file mode 100644 index 00000000..7381b297 --- /dev/null +++ b/third-party/tbb/src/tbb/threading_control.h @@ -0,0 +1,154 @@ +/* + Copyright (c) 2022-2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_threading_control_H +#define _TBB_threading_control_H + +#include "oneapi/tbb/mutex.h" +#include "oneapi/tbb/global_control.h" + +#include "threading_control_client.h" +#include "intrusive_list.h" +#include "main.h" +#include "permit_manager.h" +#include "pm_client.h" +#include "thread_dispatcher.h" +#include "cancellation_disseminator.h" +#include "thread_request_serializer.h" +#include "scheduler_common.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class thread_data; + +class threading_control; + +class threading_control_impl { +public: + threading_control_impl(threading_control*); + +public: + void release(bool blocking_terminate); + + threading_control_client create_client(arena& a); + void publish_client(threading_control_client client, d1::constraints& constraints); + + struct client_snapshot { + std::uint64_t aba_epoch; + unsigned priority_level; + thread_dispatcher_client* my_td_client; + pm_client* my_pm_client; + }; + + client_snapshot prepare_client_destruction(threading_control_client client); + bool try_destroy_client(client_snapshot deleter); + + void register_thread(thread_data& td); + void unregister_thread(thread_data& td); + void propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state); + + void set_active_num_workers(unsigned soft_limit); + std::size_t worker_stack_size(); + unsigned max_num_workers(); + + void adjust_demand(threading_control_client, int mandatory_delta, int workers_delta); + bool is_any_other_client_active(); + + thread_control_monitor& get_waiting_threads_monitor(); + +private: + static unsigned calc_workers_soft_limit(unsigned workers_hard_limit); + static std::pair calculate_workers_limits(); + static cache_aligned_unique_ptr make_permit_manager(unsigned workers_soft_limit); + static cache_aligned_unique_ptr make_thread_dispatcher(threading_control& control, + unsigned workers_soft_limit, + unsigned 
workers_hard_limit); + + // TODO: Consider allocation one chunk of memory and construct objects on it + cache_aligned_unique_ptr my_permit_manager{nullptr}; + cache_aligned_unique_ptr my_thread_dispatcher{nullptr}; + cache_aligned_unique_ptr my_thread_request_serializer{nullptr}; + cache_aligned_unique_ptr my_cancellation_disseminator{nullptr}; + cache_aligned_unique_ptr my_waiting_threads_monitor{nullptr}; +}; + + +class threading_control { + using global_mutex_type = d1::mutex; +public: + using client_snapshot = threading_control_impl::client_snapshot; + + static threading_control* register_public_reference(); + static bool unregister_public_reference(bool blocking_terminate); + + static bool is_present(); + static void set_active_num_workers(unsigned soft_limit); + static bool register_lifetime_control(); + static bool unregister_lifetime_control(bool blocking_terminate); + + threading_control_client create_client(arena& a); + void publish_client(threading_control_client client, d1::constraints& constraints); + client_snapshot prepare_client_destruction(threading_control_client client); + bool try_destroy_client(client_snapshot deleter); + + void register_thread(thread_data& td); + void unregister_thread(thread_data& td); + void propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state); + + std::size_t worker_stack_size(); + static unsigned max_num_workers(); + + void adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta); + bool is_any_other_client_active(); + + thread_control_monitor& get_waiting_threads_monitor(); + +private: + threading_control(unsigned public_ref, unsigned ref); + void add_ref(bool is_public); + bool remove_ref(bool is_public); + + static threading_control* get_threading_control(bool is_public); + static threading_control* create_threading_control(); + + bool release(bool is_public, bool blocking_terminate); + void 
wait_last_reference(global_mutex_type::scoped_lock& lock); + void destroy(); + + friend class thread_dispatcher; + + static threading_control* g_threading_control; + //! Mutex guarding creation/destruction of g_threading_control, insertions/deletions in my_arenas, and cancellation propagation + static global_mutex_type g_threading_control_mutex; + + cache_aligned_unique_ptr my_pimpl{nullptr}; + //! Count of external threads attached + std::atomic my_public_ref_count{0}; + //! Reference count controlling threading_control object lifetime + std::atomic my_ref_count{0}; +}; + +} // r1 +} // detail +} // tbb + + +#endif // _TBB_threading_control_H diff --git a/third-party/tbb/src/tbb/threading_control_client.h b/third-party/tbb/src/tbb/threading_control_client.h new file mode 100644 index 00000000..4ff9359c --- /dev/null +++ b/third-party/tbb/src/tbb/threading_control_client.h @@ -0,0 +1,58 @@ +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_threading_control_client_H +#define _TBB_threading_control_client_H + +#include "oneapi/tbb/detail/_assert.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class pm_client; +class thread_dispatcher_client; + +class threading_control_client { +public: + threading_control_client() = default; + threading_control_client(const threading_control_client&) = default; + threading_control_client& operator=(const threading_control_client&) = default; + + threading_control_client(pm_client* p, thread_dispatcher_client* t) : my_pm_client(p), my_thread_dispatcher_client(t) { + __TBB_ASSERT(my_pm_client, nullptr); + __TBB_ASSERT(my_thread_dispatcher_client, nullptr); + } + + pm_client* get_pm_client() { + return my_pm_client; + } + + thread_dispatcher_client* get_thread_dispatcher_client() { + return my_thread_dispatcher_client; + } + +private: + pm_client* my_pm_client{nullptr}; + thread_dispatcher_client* my_thread_dispatcher_client{nullptr}; +}; + + +} +} +} + +#endif // _TBB_threading_control_client_H diff --git a/third-party/tbb/src/tbb/tools_api/ittnotify.h b/third-party/tbb/src/tbb/tools_api/ittnotify.h index e701980f..eb1571dc 100644 --- a/third-party/tbb/src/tbb/tools_api/ittnotify.h +++ b/third-party/tbb/src/tbb/tools_api/ittnotify.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -101,6 +101,11 @@ The same ID may not be reused for different instances, unless a previous # define ITT_OS_FREEBSD 4 #endif /* ITT_OS_FREEBSD */ +#ifndef ITT_OS_OPENBSD +# define ITT_OS_OPENBSD 5 +#endif /* ITT_OS_OPENBSD */ + + #ifndef ITT_OS # if defined WIN32 || defined _WIN32 # define ITT_OS ITT_OS_WIN @@ -108,6 +113,8 @@ The same ID may not be reused for different instances, unless a previous # define ITT_OS ITT_OS_MAC # elif defined( __FreeBSD__ ) # define ITT_OS ITT_OS_FREEBSD +# elif defined( __OpenBSD__ ) +# define ITT_OS ITT_OS_OPENBSD # else # define ITT_OS ITT_OS_LINUX # endif @@ -129,6 +136,10 @@ The same ID may not be reused for different instances, unless a previous # define ITT_PLATFORM_FREEBSD 4 #endif /* ITT_PLATFORM_FREEBSD */ +#ifndef ITT_PLATFORM_OPENBSD +# define ITT_PLATFORM_OPENBSD 5 +#endif /* ITT_PLATFORM_OPENBSD */ + #ifndef ITT_PLATFORM # if ITT_OS==ITT_OS_WIN # define ITT_PLATFORM ITT_PLATFORM_WIN @@ -136,6 +147,8 @@ The same ID may not be reused for different instances, unless a previous # define ITT_PLATFORM ITT_PLATFORM_MAC # elif ITT_OS==ITT_OS_FREEBSD # define ITT_PLATFORM ITT_PLATFORM_FREEBSD +# elif ITT_OS==ITT_OS_OPENBSD +# define ITT_PLATFORM ITT_PLATFORM_OPENBSD # else # define ITT_PLATFORM ITT_PLATFORM_POSIX # endif @@ -305,7 +318,7 @@ extern "C" { * only pauses tracing and analyzing memory access. * It does not pause tracing or analyzing threading APIs. * . - * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE: + * Intel(R) VTune(TM) Profiler: * - Does continue to record when new threads are started. * . 
* - Other effects: @@ -320,30 +333,57 @@ void ITTAPI __itt_resume(void); /** @brief Detach collection */ void ITTAPI __itt_detach(void); +/** + * @enum __itt_collection_scope + * @brief Enumerator for collection scopes + */ +typedef enum { + __itt_collection_scope_host = 1 << 0, + __itt_collection_scope_offload = 1 << 1, + __itt_collection_scope_all = 0x7FFFFFFF +} __itt_collection_scope; + +/** @brief Pause scoped collection */ +void ITTAPI __itt_pause_scoped(__itt_collection_scope); +/** @brief Resume scoped collection */ +void ITTAPI __itt_resume_scoped(__itt_collection_scope); + /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, pause, (void)) -ITT_STUBV(ITTAPI, void, resume, (void)) -ITT_STUBV(ITTAPI, void, detach, (void)) -#define __itt_pause ITTNOTIFY_VOID(pause) -#define __itt_pause_ptr ITTNOTIFY_NAME(pause) -#define __itt_resume ITTNOTIFY_VOID(resume) -#define __itt_resume_ptr ITTNOTIFY_NAME(resume) -#define __itt_detach ITTNOTIFY_VOID(detach) -#define __itt_detach_ptr ITTNOTIFY_NAME(detach) +ITT_STUBV(ITTAPI, void, pause, (void)) +ITT_STUBV(ITTAPI, void, pause_scoped, (__itt_collection_scope)) +ITT_STUBV(ITTAPI, void, resume, (void)) +ITT_STUBV(ITTAPI, void, resume_scoped, (__itt_collection_scope)) +ITT_STUBV(ITTAPI, void, detach, (void)) +#define __itt_pause ITTNOTIFY_VOID(pause) +#define __itt_pause_ptr ITTNOTIFY_NAME(pause) +#define __itt_pause_scoped ITTNOTIFY_VOID(pause_scoped) +#define __itt_pause_scoped_ptr ITTNOTIFY_NAME(pause_scoped) +#define __itt_resume ITTNOTIFY_VOID(resume) +#define __itt_resume_ptr ITTNOTIFY_NAME(resume) +#define __itt_resume_scoped ITTNOTIFY_VOID(resume_scoped) +#define __itt_resume_scoped_ptr ITTNOTIFY_NAME(resume_scoped) +#define __itt_detach ITTNOTIFY_VOID(detach) +#define __itt_detach_ptr ITTNOTIFY_NAME(detach) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_pause() -#define __itt_pause_ptr 0 +#define __itt_pause_ptr 0 +#define 
__itt_pause_scoped(scope) +#define __itt_pause_scoped_ptr 0 #define __itt_resume() -#define __itt_resume_ptr 0 +#define __itt_resume_ptr 0 +#define __itt_resume_scoped(scope) +#define __itt_resume_scoped_ptr 0 #define __itt_detach() -#define __itt_detach_ptr 0 +#define __itt_detach_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ -#define __itt_pause_ptr 0 -#define __itt_resume_ptr 0 -#define __itt_detach_ptr 0 +#define __itt_pause_ptr 0 +#define __itt_pause_scoped_ptr 0 +#define __itt_resume_ptr 0 +#define __itt_resume_scoped_ptr 0 +#define __itt_detach_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} control group */ @@ -353,7 +393,7 @@ ITT_STUBV(ITTAPI, void, detach, (void)) * @defgroup Intel Processor Trace control * API from this group provides control over collection and analysis of Intel Processor Trace (Intel PT) data * Information about Intel Processor Trace technology can be found here (Volume 3 chapter 35): - * https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf + * https://github.com/tpn/pdfs/blob/master/Intel%2064%20and%20IA-32%20Architectures%20Software%20Developer's%20Manual%20-%20Combined%20Volumes%201-4%20-%20May%202018%20(325462-sdm-vol-1-2abcd-3abcd).pdf * Use this API to mark particular code regions for loading detailed performance statistics. * This mode makes your analysis faster and more accurate. * @{ @@ -587,8 +627,8 @@ ITT_STUBV(ITTAPI, void, suppress_pop, (void)) /** @endcond */ /** - * @enum __itt_model_disable - * @brief Enumerator for the disable methods + * @enum __itt_suppress_mode + * @brief Enumerator for the suppressing modes */ typedef enum __itt_suppress_mode { __itt_unsuppress_range, @@ -597,12 +637,12 @@ typedef enum __itt_suppress_mode { /** * @enum __itt_collection_state - * @brief Enumerator for collection state. All non-work states have negative values. + * @brief Enumerator for collection state. 
*/ typedef enum { __itt_collection_uninitialized = 0, /* uninitialized */ __itt_collection_init_fail = 1, /* failed to init */ - __itt_collection_collector_absent = 2, /* non work state collector exists */ + __itt_collection_collector_absent = 2, /* non work state collector is absent */ __itt_collection_collector_exists = 3, /* work state collector exists */ __itt_collection_init_successful = 4 /* success to init */ } __itt_collection_state; @@ -2345,7 +2385,7 @@ ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __it /** * @defgroup markers Markers - * Markers represent a single discreet event in time. Markers have a scope, + * Markers represent a single discrete event in time. Markers have a scope, * described by an enumerated type __itt_scope. Markers are created by * the API call __itt_marker. A marker instance can be given an ID for use in * adding metadata. @@ -4005,6 +4045,173 @@ __itt_collection_state __itt_get_collection_state(void); void __itt_release_resources(void); /** @endcond */ +/** + * @brief Create a typed counter with given domain pointer, string name and counter type +*/ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +__itt_counter ITTAPI __itt_counter_createA_v3(const __itt_domain* domain, const char* name, __itt_metadata_type type); +__itt_counter ITTAPI __itt_counter_createW_v3(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type); +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_counter_create_v3 __itt_counter_createW_v3 +# define __itt_counter_create_v3_ptr __itt_counter_createW_v3_ptr +#else /* UNICODE */ +# define __itt_counter_create_v3 __itt_counter_createA_v3 +# define __itt_counter_create_v3_ptr __itt_counter_createA_v3_ptr +#endif /* UNICODE */ +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +__itt_counter ITTAPI __itt_counter_create_v3(const __itt_domain* domain, const char* name, __itt_metadata_type type); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +#ifndef INTEL_NO_MACRO_BODY +#ifndef 
INTEL_NO_ITTNOTIFY_API +#if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, __itt_counter, counter_createA_v3, (const __itt_domain* domain, const char* name, __itt_metadata_type type)) +ITT_STUB(ITTAPI, __itt_counter, counter_createW_v3, (const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type)) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, __itt_counter, counter_create_v3, (const __itt_domain* domain, const char* name, __itt_metadata_type type)) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_createA_v3 ITTNOTIFY_DATA(counter_createA_v3) +#define __itt_counter_createA_v3_ptr ITTNOTIFY_NAME(counter_createA_v3) +#define __itt_counter_createW_v3 ITTNOTIFY_DATA(counter_createW_v3) +#define __itt_counter_createW_v3_ptr ITTNOTIFY_NAME(counter_createW_v3) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create_v3 ITTNOTIFY_DATA(counter_create_v3) +#define __itt_counter_create_v3_ptr ITTNOTIFY_NAME(counter_create_v3) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#else /* INTEL_NO_ITTNOTIFY_API */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_createA_v3(domain, name, type) (__itt_counter)0 +#define __itt_counter_createA_v3_ptr 0 +#define __itt_counter_createW_v3(domain, name, type) (__itt_counter)0 +#define __itt_counter_create_typedW_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create_v3(domain, name, type) (__itt_counter)0 +#define __itt_counter_create_v3_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#define __itt_counter_createA_v3_ptr 0 +#define __itt_counter_createW_v3_ptr 0 +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#define __itt_counter_create_v3_ptr 0 +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief Set the counter value api + */ +void ITTAPI 
__itt_counter_set_value_v3(__itt_counter counter, void *value_ptr); + +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, counter_set_value_v3, (__itt_counter counter, void *value_ptr)) +#define __itt_counter_set_value_v3 ITTNOTIFY_VOID(counter_set_value_v3) +#define __itt_counter_set_value_v3_ptr ITTNOTIFY_NAME(counter_set_value_v3) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_counter_set_value_v3(counter, value_ptr) +#define __itt_counter_set_value_v3_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_counter_set_value_v3_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + +/** + * @brief describes the type of context metadata +*/ +typedef enum { + __itt_context_unknown = 0, /*!< Undefined type */ + __itt_context_nameA, /*!< ASCII string char* type */ + __itt_context_nameW, /*!< Unicode string wchar_t* type */ + __itt_context_deviceA, /*!< ASCII string char* type */ + __itt_context_deviceW, /*!< Unicode string wchar_t* type */ + __itt_context_unitsA, /*!< ASCII string char* type */ + __itt_context_unitsW, /*!< Unicode string wchar_t* type */ + __itt_context_pci_addrA, /*!< ASCII string char* type */ + __itt_context_pci_addrW, /*!< Unicode string wchar_t* type */ + __itt_context_tid, /*!< Unsigned 64-bit integer type */ + __itt_context_max_val, /*!< Unsigned 64-bit integer type */ + __itt_context_bandwidth_flag, /*!< Unsigned 64-bit integer type */ + __itt_context_latency_flag, /*!< Unsigned 64-bit integer type */ + __itt_context_occupancy_flag, /*!< Unsigned 64-bit integer type */ + __itt_context_on_thread_flag, /*!< Unsigned 64-bit integer type */ + __itt_context_is_abs_val_flag, /*!< Unsigned 64-bit integer type */ + __itt_context_cpu_instructions_flag, /*!< Unsigned 64-bit integer type */ + __itt_context_cpu_cycles_flag /*!< Unsigned 64-bit integer type */ +} __itt_context_type; + +#if defined(UNICODE) || defined(_UNICODE) +# define __itt_context_name __itt_context_nameW +# 
define __itt_context_device __itt_context_deviceW +# define __itt_context_units __itt_context_unitsW +# define __itt_context_pci_addr __itt_context_pci_addrW +#else /* UNICODE || _UNICODE */ +# define __itt_context_name __itt_context_nameA +# define __itt_context_device __itt_context_deviceA +# define __itt_context_units __itt_context_unitsA +# define __itt_context_pci_addr __itt_context_pci_addrA +#endif /* UNICODE || _UNICODE */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_context_metadata +{ + __itt_context_type type; /*!< Type of the context metadata value */ + void* value; /*!< Pointer to context metadata value itself */ +} __itt_context_metadata; + +#pragma pack(pop) +/** @endcond */ + +/** @cond exclude_from_documentation */ +#pragma pack(push, 8) + +typedef struct ___itt_counter_metadata +{ + __itt_counter counter; /*!< Associated context metadata counter */ + __itt_context_type type; /*!< Type of the context metadata value */ + const char* str_valueA; /*!< String context metadata value */ +#if defined(UNICODE) || defined(_UNICODE) + const wchar_t* str_valueW; +#else /* UNICODE || _UNICODE */ + void* str_valueW; +#endif /* UNICODE || _UNICODE */ + unsigned long long value; /*!< Numeric context metadata value */ + int extra1; /*!< Reserved to the runtime */ + void* extra2; /*!< Reserved to the runtime */ + struct ___itt_counter_metadata* next; +} __itt_counter_metadata; + +#pragma pack(pop) +/** @endcond */ + +/** + * @brief Bind context metadata to counter instance + * @param[in] counter Pointer to the counter instance to which the context metadata is to be associated. + * @param[in] length The number of elements in context metadata array. + * @param[in] metadata The context metadata itself. 
+*/ +void ITTAPI __itt_bind_context_metadata_to_counter(__itt_counter counter, size_t length, __itt_context_metadata* metadata); + +/** @cond exclude_from_documentation */ +#ifndef INTEL_NO_MACRO_BODY +#ifndef INTEL_NO_ITTNOTIFY_API +ITT_STUBV(ITTAPI, void, bind_context_metadata_to_counter, (__itt_counter counter, size_t length, __itt_context_metadata* metadata)) +#define __itt_bind_context_metadata_to_counter ITTNOTIFY_VOID(bind_context_metadata_to_counter) +#define __itt_bind_context_metadata_to_counter_ptr ITTNOTIFY_NAME(bind_context_metadata_to_counter) +#else /* INTEL_NO_ITTNOTIFY_API */ +#define __itt_bind_context_metadata_to_counter(counter, length, metadata) +#define __itt_bind_context_metadata_to_counter_ptr 0 +#endif /* INTEL_NO_ITTNOTIFY_API */ +#else /* INTEL_NO_MACRO_BODY */ +#define __itt_bind_context_metadata_to_counter_ptr 0 +#endif /* INTEL_NO_MACRO_BODY */ +/** @endcond */ + #ifdef __cplusplus } #endif /* __cplusplus */ @@ -4423,7 +4630,7 @@ typedef enum __itt_error_code { __itt_error_success = 0, /*!< no error */ __itt_error_no_module = 1, /*!< module can't be loaded */ - /* %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system error message. */ + /* %1$s -- library name; win: %2$d -- system error code; unix: %2$s -- system error message. */ __itt_error_no_symbol = 2, /*!< symbol not found */ /* %1$s -- library name, %2$s -- symbol name. */ __itt_error_unknown_group = 3, /*!< unknown group specified */ diff --git a/third-party/tbb/src/tbb/tools_api/ittnotify_config.h b/third-party/tbb/src/tbb/tools_api/ittnotify_config.h index 0f5d80f6..001d42e0 100644 --- a/third-party/tbb/src/tbb/tools_api/ittnotify_config.h +++ b/third-party/tbb/src/tbb/tools_api/ittnotify_config.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -34,6 +34,10 @@ # define ITT_OS_FREEBSD 4 #endif /* ITT_OS_FREEBSD */ +#ifndef ITT_OS_OPENBSD +# define ITT_OS_OPENBSD 5 +#endif /* ITT_OS_OPENBSD */ + #ifndef ITT_OS # if defined WIN32 || defined _WIN32 # define ITT_OS ITT_OS_WIN @@ -41,6 +45,8 @@ # define ITT_OS ITT_OS_MAC # elif defined( __FreeBSD__ ) # define ITT_OS ITT_OS_FREEBSD +# elif defined( __OpenBSD__ ) +# define ITT_OS ITT_OS_OPENBSD # else # define ITT_OS ITT_OS_LINUX # endif @@ -62,6 +68,10 @@ # define ITT_PLATFORM_FREEBSD 4 #endif /* ITT_PLATFORM_FREEBSD */ +#ifndef ITT_PLATFORM_OPENBSD +# define ITT_PLATFORM_OPENBSD 5 +#endif /* ITT_PLATFORM_OPENBSD */ + #ifndef ITT_PLATFORM # if ITT_OS==ITT_OS_WIN # define ITT_PLATFORM ITT_PLATFORM_WIN @@ -69,6 +79,8 @@ # define ITT_PLATFORM ITT_PLATFORM_MAC # elif ITT_OS==ITT_OS_FREEBSD # define ITT_PLATFORM ITT_PLATFORM_FREEBSD +# elif ITT_OS==ITT_OS_OPENBSD +# define ITT_PLATFORM ITT_PLATFORM_OPENBSD # else # define ITT_PLATFORM ITT_PLATFORM_POSIX # endif @@ -232,10 +244,10 @@ #define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 } /* Replace with snapshot date YYYYMMDD for promotion build. */ -#define API_VERSION_BUILD 20180723 +#define API_VERSION_BUILD 20230630 #ifndef API_VERSION_NUM -#define API_VERSION_NUM 3.23.0 +#define API_VERSION_NUM 3.24.4 #endif /* API_VERSION_NUM */ #define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \ @@ -494,6 +506,7 @@ typedef struct __itt_counter_info struct ___itt_domain; struct ___itt_string_handle; struct ___itt_histogram; +struct ___itt_counter_metadata; #include "ittnotify.h" @@ -520,6 +533,7 @@ typedef struct ___itt_global __itt_counter_info_t* counter_list; unsigned int ipt_collect_events; struct ___itt_histogram* histogram_list; + struct ___itt_counter_metadata* counter_metadata_list; } __itt_global; #pragma pack(pop) @@ -632,7 +646,7 @@ typedef struct ___itt_global h->nameA = NULL; \ h->nameW = name ? _wcsdup(name) : NULL; \ h->domainA = NULL; \ - h->domainW = name ? 
_wcsdup(domain) : NULL; \ + h->domainW = domain ? _wcsdup(domain) : NULL; \ h->type = type; \ h->index = 0; \ h->next = NULL; \ @@ -674,6 +688,7 @@ typedef struct ___itt_global h->y_type = y_type; \ h->extra1 = 0; \ h->extra2 = NULL; \ + h->next = NULL; \ if (h_tail == NULL) \ (gptr)->histogram_list = h; \ else \ @@ -693,6 +708,7 @@ typedef struct ___itt_global h->y_type = y_type; \ h->extra1 = 0; \ h->extra2 = NULL; \ + h->next = NULL; \ if (h_tail == NULL) \ (gptr)->histogram_list = h; \ else \ @@ -700,4 +716,60 @@ typedef struct ___itt_global } \ } +#define NEW_COUNTER_METADATA_NUM(gptr,h,h_tail,counter,type,value) { \ + h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \ + if (h != NULL) { \ + h->counter = counter; \ + h->type = type; \ + h->str_valueA = NULL; \ + h->str_valueW = NULL; \ + h->value = value; \ + h->extra1 = 0; \ + h->extra2 = NULL; \ + h->next = NULL; \ + if (h_tail == NULL) \ + (gptr)->counter_metadata_list = h; \ + else \ + h_tail->next = h; \ + } \ +} + +#define NEW_COUNTER_METADATA_STR_A(gptr,h,h_tail,counter,type,str_valueA) { \ + h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \ + if (h != NULL) { \ + h->counter = counter; \ + h->type = type; \ + char *str_value_copy = NULL; \ + __itt_fstrdup(str_valueA, str_value_copy); \ + h->str_valueA = str_value_copy; \ + h->str_valueW = NULL; \ + h->value = 0; \ + h->extra1 = 0; \ + h->extra2 = NULL; \ + h->next = NULL; \ + if (h_tail == NULL) \ + (gptr)->counter_metadata_list = h; \ + else \ + h_tail->next = h; \ + } \ +} + +#define NEW_COUNTER_METADATA_STR_W(gptr,h,h_tail,counter,type,str_valueW) { \ + h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \ + if (h != NULL) { \ + h->counter = counter; \ + h->type = type; \ + h->str_valueA = NULL; \ + h->str_valueW = str_valueW ? 
_wcsdup(str_valueW) : NULL; \ + h->value = 0; \ + h->extra1 = 0; \ + h->extra2 = NULL; \ + h->next = NULL; \ + if (h_tail == NULL) \ + (gptr)->counter_metadata_list = h; \ + else \ + h_tail->next = h; \ + } \ +} + #endif /* _ITTNOTIFY_CONFIG_H_ */ diff --git a/third-party/tbb/src/tbb/tools_api/ittnotify_static.c b/third-party/tbb/src/tbb/tools_api/ittnotify_static.c index 0b9aa492..c3a53bf0 100644 --- a/third-party/tbb/src/tbb/tools_api/ittnotify_static.c +++ b/third-party/tbb/src/tbb/tools_api/ittnotify_static.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -81,7 +81,7 @@ static const char api_version[] = API_VERSION "\0\n@(#) $Revision$\n"; #if ITT_OS==ITT_OS_WIN static const char* ittnotify_lib_name = "libittnotify.dll"; -#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD +#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD|| ITT_OS==ITT_OS_OPENBSD static const char* ittnotify_lib_name = "libittnotify.so"; #elif ITT_OS==ITT_OS_MAC static const char* ittnotify_lib_name = "libittnotify.dylib"; @@ -305,7 +305,8 @@ __itt_global _N_(_ittapi_global) = { __itt_collection_uninitialized, /* collection state */ NULL, /* counter_list */ 0, /* ipt_collect_events */ - NULL /* histogram_list */ + NULL, /* histogram_list */ + NULL /* counter_metadata_list */ }; typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id); @@ -850,6 +851,169 @@ static __itt_histogram* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(histogram_create),_in return (__itt_histogram*)h; } +#if ITT_PLATFORM==ITT_PLATFORM_WIN +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW_v3),_init))(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type) +{ + __itt_counter_info_t *h_tail = NULL, *h = NULL; + + if (name == NULL || domain == NULL) + { + return NULL; + } + + 
ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { + if (ITTNOTIFY_NAME(counter_createW_v3) && ITTNOTIFY_NAME(counter_createW_v3) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW_v3),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_createW_v3)(domain, name, type); + } + else + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return NULL; + } + } + if (__itt_is_collector_available()) + { + for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameW != NULL && h->type == (int)type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain->nameW == NULL) || + (h->domainW != NULL && domain->nameW != NULL && !wcscmp(h->domainW, domain->nameW)))) break; + + } + if (h == NULL) + { + NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain->nameW,type); + } + } + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return (__itt_counter)h; +} + +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA_v3),_init))(const __itt_domain* domain, const char* name, __itt_metadata_type type) +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_v3),_init))(const __itt_domain* domain, const char* name, __itt_metadata_type type) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +{ + __itt_counter_info_t *h_tail = NULL, *h = NULL; + + if (name == NULL || domain == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + if (ITTNOTIFY_NAME(counter_createA_v3) && ITTNOTIFY_NAME(counter_createA_v3) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA_v3),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_createA_v3)(domain, name, type); + } +#else + if (ITTNOTIFY_NAME(counter_create_v3) && ITTNOTIFY_NAME(counter_create_v3) != 
ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_v3),_init))) + { + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_create_v3)(domain, name, type); + } +#endif + else + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); +#else + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); +#endif + return NULL; + } + } + if (__itt_is_collector_available()) + { + for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameA != NULL && h->type == (int)type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain->nameA == NULL) || + (h->domainA != NULL && domain->nameA != NULL && !__itt_fstrcmp(h->domainA, domain->nameA)))) break; + } + if (h == NULL) + { + NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain->nameA,type); + } + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return (__itt_counter)h; +} + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(bind_context_metadata_to_counter),_init))(__itt_counter counter, size_t length, __itt_context_metadata* metadata) +{ + __itt_counter_metadata *h_tail = NULL, *h = NULL; + + if (counter == NULL || length == 0 || metadata == NULL) + { + return; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { + if (ITTNOTIFY_NAME(bind_context_metadata_to_counter) && ITTNOTIFY_NAME(bind_context_metadata_to_counter) != ITT_VERSIONIZE(ITT_JOIN(_N_(bind_context_metadata_to_counter),_init))) + { + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + ITTNOTIFY_NAME(bind_context_metadata_to_counter)(counter, length, metadata); + } + else + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); +#else + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); +#endif + return; + } + } + if (__itt_is_collector_available()) + { + size_t item; + char* 
str_valueA = NULL; +#if ITT_PLATFORM==ITT_PLATFORM_WIN + wchar_t* str_valueW = NULL; +#endif + unsigned long long value = 0; + __itt_context_type type = __itt_context_unknown; + + for (item = 0; item < length; item++) + { + type = metadata[item].type; + for (h_tail = NULL, h = _N_(_ittapi_global).counter_metadata_list; h != NULL; h_tail = h, h = h->next) + { + if (h->counter != NULL && h->counter == counter && h->type == type) break; + } + if (h == NULL && counter != NULL && type != __itt_context_unknown) + { + if (type == __itt_context_nameA || type == __itt_context_deviceA || type == __itt_context_unitsA || type == __itt_context_pci_addrA) + { + str_valueA = (char*)(metadata[item].value); + NEW_COUNTER_METADATA_STR_A(&_N_(_ittapi_global),h,h_tail,counter,type,str_valueA); + } +#if ITT_PLATFORM==ITT_PLATFORM_WIN + else if (type == __itt_context_nameW || type == __itt_context_deviceW || type == __itt_context_unitsW || type == __itt_context_pci_addrW) + { + str_valueW = (wchar_t*)(metadata[item].value); + NEW_COUNTER_METADATA_STR_W(&_N_(_ittapi_global),h,h_tail,counter,type,str_valueW); + } +#endif + else if (type >= __itt_context_tid && type <= __itt_context_cpu_cycles_flag) + { + value = *(unsigned long long*)(metadata[item].value); + NEW_COUNTER_METADATA_NUM(&_N_(_ittapi_global),h,h_tail,counter,type,value); + } + } + } + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); +} /* -------------------------------------------------------------------------- */ static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void) @@ -876,6 +1040,30 @@ static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void) } } +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause_scoped),_init))(__itt_collection_scope scope) +{ + if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) + { + __itt_init_ittlib_name(NULL, __itt_group_all); + } + if (ITTNOTIFY_NAME(pause_scoped) && ITTNOTIFY_NAME(pause_scoped) != 
ITT_VERSIONIZE(ITT_JOIN(_N_(pause_scoped),_init))) + { + ITTNOTIFY_NAME(pause_scoped)(scope); + } +} + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume_scoped),_init))(__itt_collection_scope scope) +{ + if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) + { + __itt_init_ittlib_name(NULL, __itt_group_all); + } + if (ITTNOTIFY_NAME(resume_scoped) && ITTNOTIFY_NAME(resume_scoped) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume_scoped),_init))) + { + ITTNOTIFY_NAME(resume_scoped)(scope); + } +} + #if ITT_PLATFORM==ITT_PLATFORM_WIN static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name) { @@ -1393,6 +1581,20 @@ static void __itt_free_allocated_resources(void) current_histogram = tmp; } _N_(_ittapi_global).histogram_list = NULL; + + + __itt_counter_metadata* current_counter_metadata = _N_(_ittapi_global).counter_metadata_list; + while (current_counter_metadata != NULL) + { + __itt_counter_metadata* tmp = current_counter_metadata->next; + free((char*)current_counter_metadata->str_valueA); +#if ITT_PLATFORM==ITT_PLATFORM_WIN + free((wchar_t*)current_counter_metadata->str_valueW); +#endif + free(current_counter_metadata); + current_counter_metadata = tmp; + } + _N_(_ittapi_global).counter_metadata_list = NULL; } ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups) diff --git a/third-party/tbb/src/tbb/tools_api/ittnotify_static.h b/third-party/tbb/src/tbb/tools_api/ittnotify_static.h index d59bfac1..7f572914 100644 --- a/third-party/tbb/src/tbb/tools_api/ittnotify_static.h +++ b/third-party/tbb/src/tbb/tools_api/ittnotify_static.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -66,6 +66,8 @@ ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, con ITT_STUBV(ITTAPI, void, pause, (void), (ITT_NO_PARAMS), pause, __itt_group_control | __itt_group_legacy, "no args") ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume, __itt_group_control | __itt_group_legacy, "no args") +ITT_STUBV(ITTAPI, void, pause_scoped, (__itt_collection_scope scope), (ITT_FORMAT scope), pause_scoped, __itt_group_control, "%d") +ITT_STUBV(ITTAPI, void, resume_scoped, (__itt_collection_scope scope), (ITT_FORMAT scope), resume_scoped, __itt_group_control, "%d") #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name), (ITT_FORMAT name), thread_set_nameA, __itt_group_thread, "\"%s\"") @@ -90,6 +92,15 @@ ITT_STUB(ITTAPI, __itt_histogram*, histogram_createW, (const __itt_domain* domai ITT_STUB(ITTAPI, __itt_histogram*, histogram_create, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type), (ITT_FORMAT domain, name, x_type, y_type), histogram_create, __itt_group_structure, "%p, \"%s\", %d, %d") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + #if ITT_PLATFORM==ITT_PLATFORM_WIN +ITT_STUB(ITTAPI, __itt_counter, counter_createA_v3, (const __itt_domain* domain, const char *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_createA_v3, __itt_group_counter, "%p, \"%s\", %d") +ITT_STUB(ITTAPI, __itt_counter, counter_createW_v3, (const __itt_domain* domain, const wchar_t *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_createW_v3, __itt_group_counter, "%p, \"%s\", %d") +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +ITT_STUB(ITTAPI, __itt_counter, counter_create_v3, (const __itt_domain* domain, const char *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_create_v3, __itt_group_counter, "%p, \"%s\", %d") +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +ITT_STUBV(ITTAPI, void, 
bind_context_metadata_to_counter, (__itt_counter counter, size_t length, __itt_context_metadata* metadata), (ITT_FORMAT counter, length, metadata), bind_context_metadata_to_counter, __itt_group_structure, "%p, %lu, %p") + #endif /* __ITT_INTERNAL_BODY */ ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach, __itt_group_all, "no args") @@ -362,4 +373,6 @@ ITT_STUBV(ITTAPI, void, module_unload, (void *start_addr), (ITT_FORMAT start_add ITT_STUBV(ITTAPI, void, histogram_submit, (__itt_histogram* histogram, size_t length, void* x_data, void* y_data), (ITT_FORMAT histogram, length, x_data, y_data), histogram_submit, __itt_group_structure, "%p, %lu, %p, %p") +ITT_STUBV(ITTAPI, void, counter_set_value_v3, (__itt_counter counter, void *value_ptr), (ITT_FORMAT counter, value_ptr), counter_set_value_v3, __itt_group_counter, "%p, %p") + #endif /* __ITT_INTERNAL_INIT */ diff --git a/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h b/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h index 1c40c288..837bc480 100644 --- a/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h +++ b/third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -39,6 +39,10 @@ # define ITT_OS_FREEBSD 4 #endif /* ITT_OS_FREEBSD */ +#ifndef ITT_OS_OPENBSD +# define ITT_OS_OPENBSD 5 +#endif /* ITT_OS_OPENBSD */ + #ifndef ITT_OS # if defined WIN32 || defined _WIN32 # define ITT_OS ITT_OS_WIN @@ -46,6 +50,8 @@ # define ITT_OS ITT_OS_MAC # elif defined( __FreeBSD__ ) # define ITT_OS ITT_OS_FREEBSD +# elif defined( __OpenBSD__ ) +# define ITT_OS ITT_OS_OPENBSD # else # define ITT_OS ITT_OS_LINUX # endif @@ -67,6 +73,10 @@ # define ITT_PLATFORM_FREEBSD 4 #endif /* ITT_PLATFORM_FREEBSD */ +#ifndef ITT_PLATFORM_OPENBSD +# define ITT_PLATFORM_OPENBSD 5 +#endif /* ITT_PLATFORM_OPENBSD */ + #ifndef ITT_PLATFORM # if ITT_OS==ITT_OS_WIN # define ITT_PLATFORM ITT_PLATFORM_WIN @@ -74,6 +84,8 @@ # define ITT_PLATFORM ITT_PLATFORM_MAC # elif ITT_OS==ITT_OS_FREEBSD # define ITT_PLATFORM ITT_PLATFORM_FREEBSD +# elif ITT_OS==ITT_OS_OPENBSD +# define ITT_PLATFORM ITT_PLATFORM_OPENBSD # else # define ITT_PLATFORM ITT_PLATFORM_POSIX # endif @@ -233,7 +245,7 @@ extern "C" { * only pauses tracing and analyzing memory access. * It does not pause tracing or analyzing threading APIs. * . - * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE: + * - Intel(R) VTune(TM) Profiler: * - Does continue to record when new threads are started. * . * - Other effects: diff --git a/third-party/tbb/src/tbb/waiters.h b/third-party/tbb/src/tbb/waiters.h index 7e0906be..8ed431f8 100644 --- a/third-party/tbb/src/tbb/waiters.h +++ b/third-party/tbb/src/tbb/waiters.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include "oneapi/tbb/detail/_task.h" #include "scheduler_common.h" #include "arena.h" +#include "threading_control.h" namespace tbb { namespace detail { @@ -33,7 +34,7 @@ class waiter_base { bool pause() { if (my_backoff.pause()) { - my_arena.is_out_of_work(); + my_arena.out_of_work(); return true; } @@ -57,6 +58,24 @@ class outermost_worker_waiter : public waiter_base { __TBB_ASSERT(t == nullptr, nullptr); if (is_worker_should_leave(slot)) { + if (!governor::hybrid_cpu()) { + static constexpr std::chrono::microseconds worker_wait_leave_duration(1000); + static_assert(worker_wait_leave_duration > std::chrono::steady_clock::duration(1), "Clock resolution is not enough for measured interval."); + + for (auto t1 = std::chrono::steady_clock::now(), t2 = t1; + std::chrono::duration_cast(t2 - t1) < worker_wait_leave_duration; + t2 = std::chrono::steady_clock::now()) + { + if (!my_arena.is_empty() && !my_arena.is_recall_requested()) { + return true; + } + + if (my_arena.my_threading_control->is_any_other_client_active()) { + break; + } + d0::yield(); + } + } // Leave dispatch loop return false; } @@ -82,7 +101,7 @@ class outermost_worker_waiter : public waiter_base { using base_type = waiter_base; bool is_worker_should_leave(arena_slot& slot) const { - bool is_top_priority_arena = my_arena.my_is_top_priority.load(std::memory_order_relaxed); + bool is_top_priority_arena = my_arena.is_top_priority(); bool is_task_pool_empty = slot.task_pool.load(std::memory_order_relaxed) == EmptyTaskPool; if (is_top_priority_arena) { @@ -109,14 +128,11 @@ class sleep_waiter : public waiter_base { protected: using waiter_base::waiter_base; - bool is_arena_empty() { - return my_arena.my_pool_state.load(std::memory_order_relaxed) == arena::SNAPSHOT_EMPTY; - } - template void sleep(std::uintptr_t uniq_tag, Pred wakeup_condition) { - my_arena.my_market->get_wait_list().wait(wakeup_condition, + my_arena.get_waiting_threads_monitor().wait(wakeup_condition, 
market_context{uniq_tag, &my_arena}); + reset_wait(); } }; @@ -139,10 +155,9 @@ class external_waiter : public sleep_waiter { return; } - auto wakeup_condition = [&] { return !is_arena_empty() || !my_wait_ctx.continue_execution(); }; + auto wakeup_condition = [&] { return !my_arena.is_empty() || !my_wait_ctx.continue_execution(); }; sleep(std::uintptr_t(&my_wait_ctx), wakeup_condition); - my_backoff.reset_wait(); } d1::wait_context* wait_ctx() { @@ -176,14 +191,9 @@ class coroutine_waiter : public sleep_waiter { suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point; - auto wakeup_condition = [&] { return !is_arena_empty() || sp->m_is_owner_recalled.load(std::memory_order_relaxed); }; + auto wakeup_condition = [&] { return !my_arena.is_empty() || sp->m_is_owner_recalled.load(std::memory_order_relaxed); }; sleep(std::uintptr_t(sp), wakeup_condition); - my_backoff.reset_wait(); - } - - void reset_wait() { - my_backoff.reset_wait(); } d1::wait_context* wait_ctx() { diff --git a/third-party/tbb/src/tbbbind/CMakeLists.txt b/third-party/tbb/src/tbbbind/CMakeLists.txt index 24cd3e5d..993dc8b8 100644 --- a/third-party/tbb/src/tbbbind/CMakeLists.txt +++ b/third-party/tbb/src/tbbbind/CMakeLists.txt @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-if (DEFINED CMAKE_SKIP_BUILD_RPATH) - set(CMAKE_SKIP_BUILD_RPATH_OLD_VALUE ${CMAKE_SKIP_BUILD_RPATH}) -endif() set(CMAKE_SKIP_BUILD_RPATH TRUE) function(tbbbind_build TBBBIND_NAME REQUIRED_HWLOC_TARGET) @@ -106,10 +103,3 @@ else() tbbbind_build(tbbbind_2_5 HWLOC::hwloc_2_5 ) endif() - -if (DEFINED CMAKE_SKIP_BUILD_RPATH_OLD_VALUE) - set(CMAKE_SKIP_BUILD_RPATH ${CMAKE_SKIP_BUILD_RPATH_OLD_VALUE}) - unset(CMAKE_SKIP_BUILD_RPATH_OLD_VALUE) -else() - unset(CMAKE_SKIP_BUILD_RPATH) -endif() diff --git a/third-party/tbb/src/tbbbind/def/mac64-tbbbind.def b/third-party/tbb/src/tbbbind/def/mac64-tbbbind.def new file mode 100755 index 00000000..be72bcf9 --- /dev/null +++ b/third-party/tbb/src/tbbbind/def/mac64-tbbbind.def @@ -0,0 +1,18 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +___TBB_internal_initialize_system_topology +___TBB_internal_get_default_concurrency +___TBB_internal_destroy_system_topology + diff --git a/third-party/tbb/src/tbbbind/tbb_bind.cpp b/third-party/tbb/src/tbbbind/tbb_bind.cpp index 38201c71..50119e4e 100644 --- a/third-party/tbb/src/tbbbind/tbb_bind.cpp +++ b/third-party/tbb/src/tbbbind/tbb_bind.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2019-2021 Intel Corporation + Copyright (c) 2019-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -76,7 +76,7 @@ class system_topology { // Binding threads that locate in another Windows Processor groups // is allowed only if machine topology contains several Windows Processors groups - // and process affinity mask wasn`t limited manually (affinity mask cannot violates + // and process affinity mask wasn't limited manually (affinity mask cannot violates // processors group boundaries). bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; } @@ -104,6 +104,7 @@ class system_topology { if ( initialization_state != topology_loaded ) return; +#if __TBB_CPUBIND_PRESENT // Getting process affinity mask if ( intergroup_binding_allowed(groups_num) ) { process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology)); @@ -115,6 +116,10 @@ class system_topology { assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0); hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask); } +#else + process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology)); + process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology)); +#endif number_of_processors_groups = groups_num; } diff --git a/third-party/tbb/src/tbbbind/tbb_bind.rc b/third-party/tbb/src/tbbbind/tbb_bind.rc index bc060353..2d2b806e 100644 --- a/third-party/tbb/src/tbbbind/tbb_bind.rc +++ b/third-party/tbb/src/tbbbind/tbb_bind.rc @@ -1,4 +1,4 @@ -// Copyright (c) 2005-2023 Intel Corporation +// Copyright (c) 2005-2024 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. 
All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbbind.dll\0" diff --git a/third-party/tbb/src/tbbmalloc/CMakeLists.txt b/third-party/tbb/src/tbbmalloc/CMakeLists.txt index 0386daa3..76044fce 100644 --- a/third-party/tbb/src/tbbmalloc/CMakeLists.txt +++ b/third-party/tbb/src/tbbmalloc/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -109,5 +109,15 @@ target_link_libraries(tbbmalloc ${TBB_COMMON_LINK_LIBS} ) -tbb_install_target(tbbmalloc) +if(TBB_BUILD_APPLE_FRAMEWORKS) + set_target_properties(tbbmalloc PROPERTIES + FRAMEWORK TRUE + FRAMEWORK_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} + XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbbmalloc + MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbbmalloc + MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} + MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBBMALLOC_BINARY_VERSION} + ) +endif() +tbb_install_target(tbbmalloc) diff --git a/third-party/tbb/src/tbbmalloc/Synchronize.h b/third-party/tbb/src/tbbmalloc/Synchronize.h index faa6553c..b25d7e24 100644 --- a/third-party/tbb/src/tbbmalloc/Synchronize.h +++ b/third-party/tbb/src/tbbmalloc/Synchronize.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -62,6 +62,10 @@ class MallocMutex : tbb::detail::no_copy { } if (locked) *locked = m_taken; } + + scoped_lock(scoped_lock& other) = delete; + scoped_lock& operator=(scoped_lock&) = delete; + ~scoped_lock() { if (m_taken) { m_mutex.unlock(); diff --git a/third-party/tbb/src/tbbmalloc/TypeDefinitions.h b/third-party/tbb/src/tbbmalloc/TypeDefinitions.h index 81149166..bfadf61d 100644 --- a/third-party/tbb/src/tbbmalloc/TypeDefinitions.h +++ b/third-party/tbb/src/tbbmalloc/TypeDefinitions.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ # define __ARCH_ipf 1 # elif defined(_M_IX86)||defined(__i386__) // the latter for MinGW support # define __ARCH_x86_32 1 -# elif defined(_M_ARM) || defined(_M_ARM64) +# elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) // the latter for MinGW support # define __ARCH_other 1 # else # error Unknown processor architecture for Windows diff --git a/third-party/tbb/src/tbbmalloc/backend.cpp b/third-party/tbb/src/tbbmalloc/backend.cpp index c240e030..87531f81 100644 --- a/third-party/tbb/src/tbbmalloc/backend.cpp +++ b/third-party/tbb/src/tbbmalloc/backend.cpp @@ -297,11 +297,13 @@ inline bool BackendSync::waitTillBlockReleased(intptr_t startModifiedCnt) }; ITT_Guard ittGuard(&inFlyBlocks); #endif - for (intptr_t myBinsInFlyBlocks = inFlyBlocks.load(std::memory_order_acquire), - myCoalescQInFlyBlocks = backend->blocksInCoalescing(); ; backoff.pause()) { + intptr_t myBinsInFlyBlocks = inFlyBlocks.load(std::memory_order_acquire); + intptr_t myCoalescQInFlyBlocks = backend->blocksInCoalescing(); + while (true) { MALLOC_ASSERT(myBinsInFlyBlocks>=0 && myCoalescQInFlyBlocks>=0, nullptr); - intptr_t currBinsInFlyBlocks = inFlyBlocks.load(std::memory_order_acquire), - currCoalescQInFlyBlocks = backend->blocksInCoalescing(); + + 
intptr_t currBinsInFlyBlocks = inFlyBlocks.load(std::memory_order_acquire); + intptr_t currCoalescQInFlyBlocks = backend->blocksInCoalescing(); WhiteboxTestingYield(); // Stop waiting iff: @@ -317,11 +319,20 @@ inline bool BackendSync::waitTillBlockReleased(intptr_t startModifiedCnt) if (currCoalescQInFlyBlocks > 0 && backend->scanCoalescQ(/*forceCoalescQDrop=*/false)) break; // 4) when there are no blocks - if (!currBinsInFlyBlocks && !currCoalescQInFlyBlocks) + if (!currBinsInFlyBlocks && !currCoalescQInFlyBlocks) { // re-scan make sense only if bins were modified since scanned + auto pool = backend->extMemPool; + if (pool->hardCachesCleanupInProgress.load(std::memory_order_acquire) || + pool->softCachesCleanupInProgress.load(std::memory_order_acquire)) { + backoff.pause(); + continue; + } + return startModifiedCnt != getNumOfMods(); + } myBinsInFlyBlocks = currBinsInFlyBlocks; myCoalescQInFlyBlocks = currCoalescQInFlyBlocks; + backoff.pause(); } return true; } @@ -379,7 +390,7 @@ FreeBlock *Backend::IndexedBins::getFromBin(int binIdx, BackendSync *sync, size_ try_next: FreeBlock *fBlock = nullptr; if (!b->empty()) { - bool locked; + bool locked = false; MallocMutex::scoped_lock scopedLock(b->tLock, wait, &locked); if (!locked) { @@ -505,7 +516,7 @@ void Backend::IndexedBins::addBlock(int binIdx, FreeBlock *fBlock, size_t /* blo bool Backend::IndexedBins::tryAddBlock(int binIdx, FreeBlock *fBlock, bool addToTail) { - bool locked; + bool locked = false; Bin *b = &freeBins[binIdx]; fBlock->myBin = binIdx; if (addToTail) { @@ -597,7 +608,7 @@ FreeBlock *Backend::splitBlock(FreeBlock *fBlock, int num, size_t size, bool blo fBlock = (FreeBlock*)((uintptr_t)splitBlock + splitSize); fBlock->initHeader(); } else { - // For large object blocks cut original block and put free righ part to backend + // For large object blocks cut original block and put free right part to backend splitBlock = (FreeBlock*)((uintptr_t)fBlock + totalSize); splitBlock->initHeader(); } @@ -627,10 
+638,12 @@ FreeBlock *Backend::releaseMemInCaches(intptr_t startModifiedCnt, int *lockedBinsThreshold, int numOfLockedBins) { // something released from caches - if (extMemPool->hardCachesCleanup() - // ..or can use blocks that are in processing now - || bkndSync.waitTillBlockReleased(startModifiedCnt)) + if (extMemPool->hardCachesCleanup(false)) return (FreeBlock*)VALID_BLOCK_IN_BIN; + + if (bkndSync.waitTillBlockReleased(startModifiedCnt)) + return (FreeBlock*)VALID_BLOCK_IN_BIN; + // OS can't give us more memory, but we have some in locked bins if (*lockedBinsThreshold && numOfLockedBins) { *lockedBinsThreshold = 0; @@ -737,7 +750,7 @@ void Backend::releaseCachesToLimit() (locMemSoftLimit = memSoftLimit.load(std::memory_order_acquire))) return; // last chance to match memSoftLimit - extMemPool->hardCachesCleanup(); + extMemPool->hardCachesCleanup(true); } int Backend::IndexedBins::getMinNonemptyBin(unsigned startBin) const @@ -794,8 +807,9 @@ FreeBlock *Backend::genericGetBlock(int num, size_t size, bool needAlignedBlock) for (;;) { const intptr_t startModifiedCnt = bkndSync.getNumOfMods(); int numOfLockedBins; - + intptr_t cleanCnt; do { + cleanCnt = backendCleanCnt.load(std::memory_order_acquire); numOfLockedBins = 0; if (needAlignedBlock) { block = freeSlabAlignedBins.findBlock(nativeBin, &bkndSync, num*size, needAlignedBlock, @@ -810,7 +824,8 @@ FreeBlock *Backend::genericGetBlock(int num, size_t size, bool needAlignedBlock) block = freeSlabAlignedBins.findBlock(nativeBin, &bkndSync, num*size, needAlignedBlock, /*alignedBin=*/true, &numOfLockedBins); } - } while (!block && numOfLockedBins>lockedBinsThreshold); + } while (!block && (numOfLockedBins>lockedBinsThreshold || cleanCnt % 2 == 1 || + cleanCnt != backendCleanCnt.load(std::memory_order_acquire))); if (block) break; @@ -1395,7 +1410,10 @@ bool Backend::destroy() bool Backend::clean() { scanCoalescQ(/*forceCoalescQDrop=*/false); - + // Backend::clean is always called under synchronization so only one 
thread can + // enter to this method at once. + // backendCleanCnt%2== 1 means that clean operation is in progress + backendCleanCnt.fetch_add(1, std::memory_order_acq_rel); bool res = false; // We can have several blocks occupying a whole region, // because such regions are added in advance (see askMemFromOS() and reset()), @@ -1406,7 +1424,7 @@ bool Backend::clean() if (i == freeLargeBlockBins.getMinNonemptyBin(i)) res |= freeLargeBlockBins.tryReleaseRegions(i, this); } - + backendCleanCnt.fetch_add(1, std::memory_order_acq_rel); return res; } @@ -1458,6 +1476,7 @@ size_t Backend::Bin::reportFreeBlocks(FILE *f) for (FreeBlock *fb = head; fb; fb = fb->next) { size_t sz = fb->tryLockBlock(); fb->setMeFree(sz); + fb->rightNeig(sz)->setLeftFree(sz); fprintf(f, " [%p;%p]", fb, (void*)((uintptr_t)fb+sz)); totalSz += sz; } diff --git a/third-party/tbb/src/tbbmalloc/backend.h b/third-party/tbb/src/tbbmalloc/backend.h index 1880ab46..cbf62c0b 100644 --- a/third-party/tbb/src/tbbmalloc/backend.h +++ b/third-party/tbb/src/tbbmalloc/backend.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -265,6 +265,7 @@ class Backend { IndexedBins freeLargeBlockBins, freeSlabAlignedBins; + std::atomic backendCleanCnt; // Our friends friend class BackendSync; diff --git a/third-party/tbb/src/tbbmalloc/frontend.cpp b/third-party/tbb/src/tbbmalloc/frontend.cpp index aa358313..77f9d659 100644 --- a/third-party/tbb/src/tbbmalloc/frontend.cpp +++ b/third-party/tbb/src/tbbmalloc/frontend.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -186,6 +186,9 @@ class ThreadId { #endif public: ThreadId() : tid(GetMyTID()) {} + ThreadId(ThreadId &other) = delete; + ~ThreadId() = default; + #if USE_PTHREAD bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); } #else @@ -196,7 +199,7 @@ class ThreadId { return *this; } static bool init() { return true; } -#if __TBB_SOURCE_DIRECTLY_INCLUDED +#if __TBB_SOURCE_DIRECTLY_INCLUDED static void destroy() {} #endif }; @@ -537,7 +540,6 @@ class FreeBlockPool { std::atomic head; int size; Backend *backend; - bool lastAccessMiss; public: static const int POOL_HIGH_MARK = 32; static const int POOL_LOW_MARK = 8; @@ -591,7 +593,7 @@ class TLSData : public TLSRemote { private: std::atomic unused; public: - TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd) {} + TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd), currCacheIdx(0) {} MemoryPool *getMemPool() const { return memPool; } Bin* getAllocationBin(size_t size); void release(); @@ -694,7 +696,7 @@ bool AllLocalCaches::cleanup(bool cleanOnlyUnused) void AllLocalCaches::markUnused() { - bool locked; + bool locked = false; MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked); if (!locked) // not wait for marking if someone doing something with it return; @@ -1519,7 +1521,7 @@ bool Block::readyToShare() { MallocMutex::scoped_lock scoped_cs(publicFreeListLock); if ( (oldVal=publicFreeList)==nullptr ) - (intptr_t&)(publicFreeList) = UNUSABLE; + publicFreeList = reinterpret_cast(UNUSABLE); } #endif return oldVal==nullptr; @@ -1646,6 +1648,7 @@ bool OrphanedBlocks::cleanup(Backend* backend) FreeBlockPool::ResOfGet FreeBlockPool::getBlock() { Block *b = head.exchange(nullptr); + bool lastAccessMiss; if (b) { size--; @@ -1808,7 +1811,7 @@ void TLSData::release() if (syncOnMailbox) { // Although, we synchronized on nextPrivatizable inside a block, we still need to - // synchronize on the bin lifetime 
because the thread releasing an object into the public + // synchronize on the bin lifetime because the thread releasing an object into the public // free list is touching the bin (mailbox and mailLock) MallocMutex::scoped_lock scoped_cs(bin[index].mailLock); } @@ -2868,7 +2871,7 @@ void doThreadShutdownNotification(TLSData* tls, bool main_thread) defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false)); // Take lock to walk through other pools; but waiting might be dangerous at this point // (e.g. on Windows the main thread might deadlock) - bool locked; + bool locked = false; MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked); if (locked) { // the list is safe to process for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next) @@ -3295,7 +3298,7 @@ extern "C" int scalable_allocation_command(int cmd, void *param) released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true); break; case TBBMALLOC_CLEAN_ALL_BUFFERS: - released = defaultMemPool->extMemPool.hardCachesCleanup(); + released = defaultMemPool->extMemPool.hardCachesCleanup(true); break; default: return TBBMALLOC_INVALID_PARAM; diff --git a/third-party/tbb/src/tbbmalloc/large_objects.cpp b/third-party/tbb/src/tbbmalloc/large_objects.cpp index 8b470ab5..3454ca62 100644 --- a/third-party/tbb/src/tbbmalloc/large_objects.cpp +++ b/third-party/tbb/src/tbbmalloc/large_objects.cpp @@ -134,7 +134,7 @@ class CacheBinFunctor { public: CacheBinFunctor(typename LargeObjectCacheImpl::CacheBin *bin, ExtMemoryPool *extMemPool, typename LargeObjectCacheImpl::BinBitMask *bitMask, int idx) : - bin(bin), extMemPool(extMemPool), bitMask(bitMask), idx(idx), toRelease(nullptr), needCleanup(false) {} + bin(bin), extMemPool(extMemPool), bitMask(bitMask), idx(idx), toRelease(nullptr), needCleanup(false), currTime(0) {} void operator()(CacheBinOperation* opList); bool isCleanupNeeded() const { return needCleanup; } @@ -1020,17 
+1020,33 @@ void ExtMemoryPool::freeLargeObjectList(LargeMemoryBlock *head) bool ExtMemoryPool::softCachesCleanup() { - return loc.regularCleanup(); + bool ret = false; + if (!softCachesCleanupInProgress.exchange(1, std::memory_order_acq_rel)) { + ret = loc.regularCleanup(); + softCachesCleanupInProgress.store(0, std::memory_order_release); + } + return ret; } -bool ExtMemoryPool::hardCachesCleanup() +bool ExtMemoryPool::hardCachesCleanup(bool wait) { + if (hardCachesCleanupInProgress.exchange(1, std::memory_order_acq_rel)) { + if (!wait) + return false; + + AtomicBackoff backoff; + while (hardCachesCleanupInProgress.exchange(1, std::memory_order_acq_rel)) + backoff.pause(); + } + // thread-local caches must be cleaned before LOC, // because object from thread-local cache can be released to LOC bool ret = releaseAllLocalCaches(); ret |= orphanedBlocks.cleanup(&backend); ret |= loc.cleanAll(); ret |= backend.clean(); + + hardCachesCleanupInProgress.store(0, std::memory_order_release); return ret; } diff --git a/third-party/tbb/src/tbbmalloc/large_objects.h b/third-party/tbb/src/tbbmalloc/large_objects.h index ff205ccd..85197842 100644 --- a/third-party/tbb/src/tbbmalloc/large_objects.h +++ b/third-party/tbb/src/tbbmalloc/large_objects.h @@ -80,16 +80,19 @@ struct HugeBinStructureProps { static const unsigned NumBins = (MaxSizeExp - MinSizeExp) * StepFactor; static size_t alignToBin(size_t size) { + MALLOC_ASSERT(size >= StepFactor, "Size must not be less than the StepFactor"); size_t minorStepExp = BitScanRev(size) - StepFactorExp; return alignUp(size, 1ULL << minorStepExp); } - // Sizes between the power of 2 values are aproximated to StepFactor. + // Sizes between the power of 2 values are approximated to StepFactor. 
static int sizeToIdx(size_t size) { MALLOC_ASSERT(MinSize <= size && size <= MaxSize, ASSERT_TEXT); int sizeExp = (int)BitScanRev(size); // same as __TBB_Log2 + MALLOC_ASSERT(sizeExp >= 0, "A shift amount (sizeExp) must not be negative"); size_t majorStepSize = 1ULL << sizeExp; int minorStepExp = sizeExp - StepFactorExp; + MALLOC_ASSERT(minorStepExp >= 0, "A shift amount (minorStepExp) must not be negative"); int minorIdx = (size - majorStepSize) >> minorStepExp; MALLOC_ASSERT(size == majorStepSize + ((size_t)minorIdx << minorStepExp), "Size is not aligned on the bin"); @@ -240,7 +243,7 @@ class LargeObjectCacheImpl { // for fast finding of used bins and bins with non-zero usedSize; // indexed from the end, as we need largest 1st BinBitMask bitMask; - // bins with lists of recently freed large blocks cached for re-use + // bins with lists of recently freed large blocks cached for reuse CacheBin bin[numBins]; public: diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp b/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp index 675726ea..b72e03a7 100644 --- a/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp +++ b/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp @@ -42,7 +42,7 @@ namespace internal { #if _WIN32||_WIN64 #define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll" #elif __APPLE__ -#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".dylib" +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".2.dylib" #elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so" #elif __unix__ @@ -98,6 +98,10 @@ struct RegisterProcessShutdownNotification { // MALLOC_ASSERT(ret, "Allocator can't load itself."); dlopen(MALLOCLIB_NAME, RTLD_NOW); } + + RegisterProcessShutdownNotification(RegisterProcessShutdownNotification&) = delete; + RegisterProcessShutdownNotification& operator=(const RegisterProcessShutdownNotification&) = delete; + ~RegisterProcessShutdownNotification() { 
__TBB_mallocProcessShutdownNotification(false); } diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc.rc b/third-party/tbb/src/tbbmalloc/tbbmalloc.rc index 77e87ff5..2821adda 100644 --- a/third-party/tbb/src/tbbmalloc/tbbmalloc.rc +++ b/third-party/tbb/src/tbbmalloc/tbbmalloc.rc @@ -1,4 +1,4 @@ -// Copyright (c) 2005-2023 Intel Corporation +// Copyright (c) 2005-2024 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbmalloc.dll\0" diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h index 352d41a8..44fa47aa 100644 --- a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h +++ b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -102,9 +102,13 @@ void suppress_unused_warning( const T& ) {} /* * Default huge page size */ +#if defined __loongarch64 +static const size_t HUGE_PAGE_SIZE = 32 * 1024 * 1024; +#else static const size_t HUGE_PAGE_SIZE = 2 * 1024 * 1024; +#endif -/********** End of global default constatns *********/ +/********** End of global default constants *********/ /********** Various numeric parameters controlling allocations ********/ @@ -576,6 +580,9 @@ struct ExtMemoryPool { fixedPool; TLSKey tlsPointerKey; // per-pool TLS key + std::atomic softCachesCleanupInProgress; + std::atomic hardCachesCleanupInProgress; + bool init(intptr_t poolId, rawAllocType rawAlloc, rawFreeType rawFree, size_t granularity, bool keepAllMemory, bool fixedPool); bool initTLS(); @@ -586,7 +593,7 @@ struct ExtMemoryPool { // true if something has been released bool softCachesCleanup(); bool releaseAllLocalCaches(); - bool hardCachesCleanup(); + bool hardCachesCleanup(bool wait); void *remap(void *ptr, size_t oldSize, size_t newSize, size_t alignment); bool reset() { loc.reset(); @@ -669,13 +676,16 @@ class RecursiveMallocCallProtector { char scoped_lock_space[sizeof(MallocMutex::scoped_lock)+1]; public: - RecursiveMallocCallProtector() : lock_acquired(nullptr) { lock_acquired = new (scoped_lock_space) MallocMutex::scoped_lock( rmc_mutex ); if (canUsePthread) owner_thread.store(pthread_self(), std::memory_order_relaxed); autoObjPtr.store(&scoped_lock_space, std::memory_order_relaxed); } + + RecursiveMallocCallProtector(RecursiveMallocCallProtector&) = delete; + RecursiveMallocCallProtector& operator=(RecursiveMallocCallProtector) = delete; + ~RecursiveMallocCallProtector() { if (lock_acquired) { autoObjPtr.store(nullptr, std::memory_order_relaxed); diff --git a/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt b/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt index 5c23f15d..554ddc85 100644 --- a/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt +++ 
b/third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -90,4 +90,14 @@ target_link_libraries(tbbmalloc_proxy ${TBB_COMMON_LINK_LIBS} ) +if(TBB_BUILD_APPLE_FRAMEWORKS) + set_target_properties(tbbmalloc_proxy PROPERTIES + FRAMEWORK TRUE + FRAMEWORK_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} + XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbbmalloc-proxy + MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbbmalloc-proxy + MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} + MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBBMALLOC_BINARY_VERSION}) +endif() + tbb_install_target(tbbmalloc_proxy) diff --git a/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc b/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc index 20b3b480..1884b119 100644 --- a/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc +++ b/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc @@ -1,4 +1,4 @@ -// Copyright (c) 2005-2023 Intel Corporation +// Copyright (c) 2005-2024 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2024 Intel Corporation. 
All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbmalloc_proxy.dll\0" diff --git a/third-party/tbb/test/CMakeLists.txt b/third-party/tbb/test/CMakeLists.txt index 05466970..cfde681b 100644 --- a/third-party/tbb/test/CMakeLists.txt +++ b/third-party/tbb/test/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,6 +29,9 @@ function(tbb_add_test) add_executable(${_tbb_test_TARGET_NAME} ${_tbb_test_SUBDIR}/${_tbb_test_NAME}.cpp) target_include_directories(${_tbb_test_TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}) + # cmake>=3.4 no longer adds flags to export symbols from executables (CMP0065) + set_property(TARGET ${_tbb_test_TARGET_NAME} PROPERTY ENABLE_EXPORTS TRUE) + target_compile_options(${_tbb_test_TARGET_NAME} PRIVATE ${TBB_CXX_STD_FLAG} @@ -40,6 +43,10 @@ function(tbb_add_test) ${TBB_COMMON_COMPILE_FLAGS} ) + if (TBB_BUILD_APPLE_FRAMEWORKS) + add_compile_definitions(TBB_USE_APPLE_FRAMEWORKS) + endif() + if (ANDROID_PLATFORM) # Expand the linker rpath by the CMAKE_LIBRARY_OUTPUT_DIRECTORY path since clang compiler from Android SDK # doesn't respect the -L flag. 
@@ -446,6 +453,17 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_eh_thread DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_global_control DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_task DEPENDENCIES TBB::tbb) + + if (TBB_FUZZ_TESTING AND NOT WIN32) + if (NOT ((CMAKE_CXX_COMPILER_ID STREQUAL Clang) OR (CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM))) + message(FATAL_ERROR "Fuzzing requires Clang or IntelLLVM compiler.") + endif() + tbb_add_test(SUBDIR tbb NAME test_fuzzing) + add_dependencies(test_fuzzing test_task) + target_compile_definitions(test_fuzzing PRIVATE CMD="$ >/dev/null 2>&1") + target_link_options(test_fuzzing PRIVATE -fsanitize=fuzzer) + endif() + tbb_add_test(SUBDIR tbb NAME test_concurrent_monitor DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_scheduler_mix DEPENDENCIES TBB::tbb) @@ -456,9 +474,11 @@ if (TARGET TBB::tbb) ) # HWLOC related test - tbb_add_tbbbind_test(SUBDIR tbb NAME test_arena_constraints) + if (NOT TBB_EMSCRIPTEN) + tbb_add_tbbbind_test(SUBDIR tbb NAME test_arena_constraints) + endif() - if (NOT "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "mips") + if ((NOT "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "mips") AND (NOT TBB_EMSCRIPTEN)) # TODO: Fix for MIPS tbb_add_test(SUBDIR tbb NAME test_tbb_fork DEPENDENCIES TBB::tbb) endif() @@ -533,7 +553,9 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_graph DEPENDENCIES TBB::tbb) # HWLOC related conformance - tbb_add_tbbbind_test(SUBDIR conformance NAME conformance_arena_constraints) + if (NOT TBB_EMSCRIPTEN) + tbb_add_tbbbind_test(SUBDIR conformance NAME conformance_arena_constraints) + endif() if (MSVC AND BUILD_SHARED_LIBS AND CMAKE_VERSION VERSION_GREATER 3.13) # LINK_OPTIONS property first appeared in 3.13 # version of the CMake @@ -543,7 +565,7 @@ if (TARGET TBB::tbb) target_include_directories(test_implicit_linkage_on_windows PRIVATE $) set_target_properties(test_implicit_linkage_on_windows PROPERTIES - LINK_OPTIONS /LIBPATH:$) + 
LINK_OPTIONS LINKER:/LIBPATH:$) add_dependencies(test_implicit_linkage_on_windows TBB::tbb) endif() endif() @@ -551,62 +573,63 @@ endif() if (TARGET TBB::tbbmalloc) # TBB allocator tests if (NOT "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "mips") - # Define TBB malloc tests - tbb_add_test(SUBDIR tbbmalloc NAME test_scalable_allocator DEPENDENCIES TBB::tbbmalloc) - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_pools DEPENDENCIES TBB::tbbmalloc) - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_init_shutdown DEPENDENCIES TBB::tbbmalloc) - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_regression DEPENDENCIES TBB::tbbmalloc) - if (TARGET TBB::tbb) - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_shutdown_hang DEPENDENCIES TBB::tbb TBB::tbbmalloc) - endif() - - if (NOT (WINDOWS_STORE OR TBB_WINDOWS_DRIVER)) - # TODO: Consider adding following tests on WINDOWS_STORE and TBB_WINDOWS_DRIVER platforms - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_compliance DEPENDENCIES TBB::tbbmalloc) - tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_used_by_lib DEPENDENCIES TBB::tbbmalloc) - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_used_by_lib DEPENDENCIES _test_malloc_used_by_lib) - tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_lib_unload) - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_lib_unload DEPENDENCIES _test_malloc_lib_unload) - endif() + if (NOT TBB_EMSCRIPTEN) + # Define TBB malloc tests + tbb_add_test(SUBDIR tbbmalloc NAME test_scalable_allocator DEPENDENCIES TBB::tbbmalloc) + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_pools DEPENDENCIES TBB::tbbmalloc) + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_init_shutdown DEPENDENCIES TBB::tbbmalloc) + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_regression DEPENDENCIES TBB::tbbmalloc) + if (TARGET TBB::tbb) + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_shutdown_hang DEPENDENCIES TBB::tbb TBB::tbbmalloc) + endif() - enable_language(C) - tbb_add_c_test(SUBDIR tbbmalloc NAME test_malloc_pure_c DEPENDENCIES 
TBB::tbbmalloc) + if (NOT (WINDOWS_STORE OR TBB_WINDOWS_DRIVER)) + # TODO: Consider adding following tests on WINDOWS_STORE and TBB_WINDOWS_DRIVER platforms + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_compliance DEPENDENCIES TBB::tbbmalloc) + tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_used_by_lib DEPENDENCIES TBB::tbbmalloc) + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_used_by_lib DEPENDENCIES _test_malloc_used_by_lib) + tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_lib_unload) + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_lib_unload DEPENDENCIES _test_malloc_lib_unload) + endif() + enable_language(C) + tbb_add_c_test(SUBDIR tbbmalloc NAME test_malloc_pure_c DEPENDENCIES TBB::tbbmalloc) + endif() # ---------------------------------------------------------------------------------------- # Whitebox testing - - add_executable(test_malloc_whitebox tbbmalloc/test_malloc_whitebox.cpp) - - target_include_directories(test_malloc_whitebox - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../include - ${CMAKE_CURRENT_SOURCE_DIR}/.. 
- ${CMAKE_CURRENT_SOURCE_DIR}) - target_compile_definitions(test_malloc_whitebox PRIVATE __TBBMALLOC_BUILD) - target_compile_options(test_malloc_whitebox - PRIVATE - ${TBB_CXX_STD_FLAG} - ${TBB_WARNING_SUPPRESS} - ${TBB_TEST_COMPILE_FLAGS} - ${TBB_COMMON_COMPILE_FLAGS} - ${TBBMALLOC_LIB_COMPILE_FLAGS} - ) - if (ANDROID_PLATFORM) - add_test(NAME test_malloc_whitebox - COMMAND ${CMAKE_COMMAND} - -DBINARIES_PATH=${CMAKE_LIBRARY_OUTPUT_DIRECTORY} - -DTEST_NAME=test_malloc_whitebox - -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake) - else() - add_test(NAME test_malloc_whitebox COMMAND test_malloc_whitebox --force-colors=1) - endif() - if (COMMAND target_link_options) - target_link_options(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS}) - else() - target_link_libraries(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS}) + if (NOT TBB_EMSCRIPTEN) + add_executable(test_malloc_whitebox tbbmalloc/test_malloc_whitebox.cpp) + + target_include_directories(test_malloc_whitebox + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/.. 
+ ${CMAKE_CURRENT_SOURCE_DIR}) + target_compile_definitions(test_malloc_whitebox PRIVATE __TBBMALLOC_BUILD) + target_compile_options(test_malloc_whitebox + PRIVATE + ${TBB_CXX_STD_FLAG} + ${TBB_WARNING_SUPPRESS} + ${TBB_TEST_COMPILE_FLAGS} + ${TBB_COMMON_COMPILE_FLAGS} + ${TBBMALLOC_LIB_COMPILE_FLAGS} + ) + if (ANDROID_PLATFORM) + add_test(NAME test_malloc_whitebox + COMMAND ${CMAKE_COMMAND} + -DBINARIES_PATH=${CMAKE_LIBRARY_OUTPUT_DIRECTORY} + -DTEST_NAME=test_malloc_whitebox + -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake) + else() + add_test(NAME test_malloc_whitebox COMMAND test_malloc_whitebox --force-colors=1) + endif() + if (COMMAND target_link_options) + target_link_options(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS}) + else() + target_link_libraries(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS}) + endif() + target_link_libraries(test_malloc_whitebox PRIVATE Threads::Threads ${TBB_COMMON_LINK_LIBS}) endif() - target_link_libraries(test_malloc_whitebox PRIVATE Threads::Threads ${TBB_COMMON_LINK_LIBS}) - # ------------------------------------------------------------------------------------------ # Define TBB malloc conformance tests @@ -624,7 +647,9 @@ if (TARGET TBB::tbbmalloc) if (BUILD_SHARED_LIBS AND NOT TBB_SANITIZE MATCHES "thread" AND TBBMALLOC_PROXY_BUILD AND NOT MSVC_CXX_ARCHITECTURE_ID MATCHES "ARM64") # Define TBB malloc proxy tests tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_atexit DEPENDENCIES TBB::tbbmalloc_proxy TBB::tbbmalloc) - tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_atexit DEPENDENCIES TBB::tbbmalloc_proxy TBB::tbbmalloc _test_malloc_atexit) + if (NOT TBB_EMSCRIPTEN) + tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_atexit DEPENDENCIES TBB::tbbmalloc_proxy TBB::tbbmalloc _test_malloc_atexit) + endif() tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_overload DEPENDENCIES TBB::tbbmalloc_proxy) tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_overload_disable DEPENDENCIES TBB::tbbmalloc_proxy 
TBB::tbbmalloc) # safer_msize call need to be available tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_new_handler DEPENDENCIES TBB::tbbmalloc_proxy) diff --git a/third-party/tbb/test/common/common_arena_constraints.h b/third-party/tbb/test/common/common_arena_constraints.h index 2c84b260..4f2da920 100644 --- a/third-party/tbb/test/common/common_arena_constraints.h +++ b/third-party/tbb/test/common/common_arena_constraints.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2019-2022 Intel Corporation + Copyright (c) 2019-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -89,6 +89,7 @@ int get_processors_group_count() { return 1; } #define __HWLOC_HYBRID_CPUS_INTERFACES_VALID (!_WIN32 || _WIN64) #define __HYBRID_CPUS_TESTING __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT && __HWLOC_HYBRID_CPUS_INTERFACES_VALID +#define __HWLOC_CPUBIND_PRESENT (!__APPLE__) // Macro to check hwloc interfaces return codes #define hwloc_require_ex(command, ...) 
\ @@ -179,12 +180,16 @@ class system_info { #endif hwloc_require_ex(hwloc_topology_load, topology); +#if __HWLOC_CPUBIND_PRESENT if ( get_processors_group_count() > 1 ) { process_cpuset = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology)); } else { process_cpuset = hwloc_bitmap_alloc(); hwloc_require_ex(hwloc_get_cpubind, topology, process_cpuset, 0); } +#else + process_cpuset = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology)); +#endif hwloc_obj_t current_numa_node = nullptr; index_info current_node_info{}; @@ -349,7 +354,11 @@ class system_info { static affinity_mask allocate_current_affinity_mask() { affinity_mask result = hwloc_bitmap_alloc(); instance().memory_handler.insert(result); +#if __HWLOC_CPUBIND_PRESENT hwloc_require_ex(hwloc_get_cpubind, instance().topology, result, HWLOC_CPUBIND_THREAD); +#else + hwloc_bitmap_copy(result, hwloc_topology_get_complete_cpuset(instance().topology)); +#endif REQUIRE_MESSAGE(!hwloc_bitmap_iszero(result), "Empty current affinity mask."); return result; } diff --git a/third-party/tbb/test/common/concurrency_tracker.h b/third-party/tbb/test/common/concurrency_tracker.h index d2397cd1..fcc4a191 100644 --- a/third-party/tbb/test/common/concurrency_tracker.h +++ b/third-party/tbb/test/common/concurrency_tracker.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -112,7 +112,7 @@ struct ExactConcurrencyLevel : NoCopy { public: void operator()(size_t) const { size_t v = ++myActiveBodyCnt; - CHECK_MESSAGE(v <= myConcLevel, "Number of active bodies is too high."); + REQUIRE_MESSAGE(v <= myConcLevel, "Number of active bodies is too high."); if (v == myConcLevel) // record that the max expected concurrency was observed myReachedMax = true; // try to get barrier when 1st time in the thread diff --git a/third-party/tbb/test/common/concurrent_associative_common.h b/third-party/tbb/test/common/concurrent_associative_common.h index 2a302f66..ecf28168 100644 --- a/third-party/tbb/test/common/concurrent_associative_common.h +++ b/third-party/tbb/test/common/concurrent_associative_common.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -369,7 +369,7 @@ void check_unequal( const T& cont1, const T& cont2 ) { REQUIRE_MESSAGE(cont1 != cont2, "Containers should be unequal"); REQUIRE_MESSAGE(cont2 != cont1, "Containers should be unequal"); REQUIRE_MESSAGE(!(cont1 == cont2), "Containers should not be equal"); - REQUIRE_MESSAGE(!(cont2 == cont1), "Containers shuold not be equal"); + REQUIRE_MESSAGE(!(cont2 == cont1), "Containers should not be equal"); } // Break value for maps diff --git a/third-party/tbb/test/common/cpu_usertime.h b/third-party/tbb/test/common/cpu_usertime.h index b0b0201d..dd71283b 100644 --- a/third-party/tbb/test/common/cpu_usertime.h +++ b/third-party/tbb/test/common/cpu_usertime.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -41,7 +41,7 @@ inline double GetCPUUserTime() { #elif _WIN32 FILETIME my_times[4]; bool status = GetProcessTimes(GetCurrentProcess(), my_times, my_times+1, my_times+2, my_times+3)!=0; - CHECK( status ); + CHECK_FAST(status); LARGE_INTEGER usrtime; usrtime.LowPart = my_times[3].dwLowDateTime; usrtime.HighPart = my_times[3].dwHighDateTime; diff --git a/third-party/tbb/test/common/doctest.h b/third-party/tbb/test/common/doctest.h index 8714c5b2..413a5b3f 100644 --- a/third-party/tbb/test/common/doctest.h +++ b/third-party/tbb/test/common/doctest.h @@ -1555,10 +1555,13 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-comparison") , m_at(at) {} DOCTEST_NOINLINE operator Result() { +// TODO: upstream the change to doctest : Work-around for the warning: 'address will never be NULL' +DOCTEST_GCC_SUPPRESS_WARNING_WITH_PUSH("-Waddress") // this is needed only for MSVC 2015 DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4800) // 'int': forcing value to bool bool res = static_cast(lhs); DOCTEST_MSVC_SUPPRESS_WARNING_POP +DOCTEST_GCC_SUPPRESS_WARNING_POP if(m_at & assertType::is_false) { //!OCLINT bitwise operator in conditional res = !res; } diff --git a/third-party/tbb/test/common/utils_concurrency_limit.h b/third-party/tbb/test/common/utils_concurrency_limit.h index 4b1e8d20..9d0b3c77 100644 --- a/third-party/tbb/test/common/utils_concurrency_limit.h +++ b/third-party/tbb/test/common/utils_concurrency_limit.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2022 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -287,27 +287,44 @@ bool can_change_thread_priority() { return false; } -void increase_thread_priority() { #if __unix__ - pthread_t this_thread = pthread_self(); - sched_param params; - params.sched_priority = sched_get_priority_max(SCHED_FIFO); - ASSERT(params.sched_priority != -1, nullptr); - int err = pthread_setschedparam(this_thread, SCHED_FIFO, ¶ms); - ASSERT(err == 0, "Can not change thread priority."); -#endif -} +class increased_priority_guard { +public: + increased_priority_guard() : m_backup(get_current_schedparam()) { + increase_thread_priority(); + } -void decrease_thread_priority() { -#if __unix__ - pthread_t this_thread = pthread_self(); - sched_param params; - params.sched_priority = sched_get_priority_min(SCHED_FIFO); - ASSERT(params.sched_priority != -1, nullptr); - int err = pthread_setschedparam(this_thread, SCHED_FIFO, ¶ms); - ASSERT(err == 0, "Can not change thread priority."); + ~increased_priority_guard() { + // restore priority on destruction + pthread_t this_thread = pthread_self(); + int err = pthread_setschedparam(this_thread, + /*policy*/ m_backup.first, /*sched_param*/ &m_backup.second); + ASSERT(err == 0, nullptr); + } +private: + std::pair get_current_schedparam() { + pthread_t this_thread = pthread_self(); + sched_param params; + int policy = 0; + int err = pthread_getschedparam(this_thread, &policy, ¶ms); + ASSERT(err == 0, nullptr); + return std::make_pair(policy, params); + } + + void increase_thread_priority() { + pthread_t this_thread = pthread_self(); + sched_param params; + params.sched_priority = sched_get_priority_max(SCHED_FIFO); + ASSERT(params.sched_priority != -1, nullptr); + int err = pthread_setschedparam(this_thread, SCHED_FIFO, ¶ms); + ASSERT(err == 0, "Can not change thread priority."); + } + + std::pair m_backup; +}; +#else + class increased_priority_guard{}; #endif -} } // namespace utils diff --git a/third-party/tbb/test/common/utils_dynamic_libs.h b/third-party/tbb/test/common/utils_dynamic_libs.h index 
c84beac7..5e5365fc 100644 --- a/third-party/tbb/test/common/utils_dynamic_libs.h +++ b/third-party/tbb/test/common/utils_dynamic_libs.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,9 +46,17 @@ namespace utils { #endif #define EXT ".dll" #else +#if TBB_USE_APPLE_FRAMEWORKS +#define PREFIX // When built as Apple* Framework, the binary has no lib prefix +#else #define PREFIX "lib" +#endif #if __APPLE__ +#if TBB_USE_APPLE_FRAMEWORKS +#define EXT // When built as Apple* Framework, the binary has no extension +#else #define EXT ".dylib" +#endif // Android SDK build system does not support .so file name versioning #elif __FreeBSD__ || __NetBSD__ || __sun || _AIX || __ANDROID__ #define EXT ".so" @@ -58,10 +66,15 @@ namespace utils { #error Unknown OS #endif #endif +#if TBB_USE_APPLE_FRAMEWORKS +#define MALLOCFRAMEWORK "tbbmalloc.framework/" +#else +#define MALLOCFRAMEWORK +#endif // Form the names of the TBB memory allocator binaries. 
-#define MALLOCLIB_NAME1 PREFIX "tbbmalloc" SUFFIX1 EXT -#define MALLOCLIB_NAME2 PREFIX "tbbmalloc" SUFFIX2 EXT +#define MALLOCLIB_NAME1 MALLOCFRAMEWORK PREFIX "tbbmalloc" SUFFIX1 EXT +#define MALLOCLIB_NAME2 MALLOCFRAMEWORK PREFIX "tbbmalloc" SUFFIX2 EXT #if _WIN32 || _WIN64 using LIBRARY_HANDLE = HMODULE; diff --git a/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp b/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp index de54169c..52faac52 100644 --- a/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp +++ b/third-party/tbb/test/conformance/conformance_blocked_rangeNd.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2017-2021 Intel Corporation + Copyright (c) 2017-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -245,6 +245,7 @@ TEST_CASE("Serial test") { SerialTest(); } +#if !EMSCRIPTEN //! Testing blocked_rangeNd interface with parallel_for //! \brief \ref requirement TEST_CASE("Parallel test") { @@ -253,6 +254,7 @@ TEST_CASE("Parallel test") { ParallelTest(); } } +#endif //! Testing blocked_rangeNd with proportional splitting //! \brief \ref interface \ref requirement diff --git a/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp b/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp index 21f06144..0c3ec6e9 100644 --- a/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp +++ b/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -1487,9 +1487,9 @@ void test_heterogeneous_lookup() { test_heterogeneous_equal_range(); } -//! Test consruction with hash_compare +//! Test construction with hash_compare //! 
\brief \ref interface \ref requirement -TEST_CASE("testing consruction with hash_compare") { +TEST_CASE("testing construction with hash_compare") { TestHashCompareConstructors(); } diff --git a/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp b/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp index 10db09fb..32c1652e 100644 --- a/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp +++ b/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp @@ -1547,9 +1547,9 @@ TEST_CASE("testing iterators") { TestQueueIteratorWorks(); } -//! Test concurrent oprations support +//! Test concurrent operations support //! \brief \ref requirement -TEST_CASE("testing concurrent oprations support") { +TEST_CASE("testing concurrent operations support") { TestConcurrentPushPop(); } diff --git a/third-party/tbb/test/conformance/conformance_concurrent_vector.cpp b/third-party/tbb/test/conformance/conformance_concurrent_vector.cpp index 18e8bc3d..8bc67a60 100644 --- a/third-party/tbb/test/conformance/conformance_concurrent_vector.cpp +++ b/third-party/tbb/test/conformance/conformance_concurrent_vector.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -1224,7 +1224,7 @@ void TestDeductionGuides() { TVector v1(v.begin(), v.end()); static_assert(std::is_same>::value); - // check TVector(InputIterator, InputIterator, Alocator) + // check TVector(InputIterator, InputIterator, Allocator) TVector v2(v.begin(), v.end(), std::allocator()); static_assert(std::is_same>>::value); @@ -1234,7 +1234,7 @@ void TestDeductionGuides() { static_assert(std::is_same>::value); - // check TVector(std::initializer_list, Alocator) + // check TVector(std::initializer_list, Allocator) TVector v4(l, std::allocator()); static_assert(std::is_same>>::value); diff --git a/third-party/tbb/test/conformance/conformance_flowgraph.h b/third-party/tbb/test/conformance/conformance_flowgraph.h index e3926a73..46aecfc1 100644 --- a/third-party/tbb/test/conformance/conformance_flowgraph.h +++ b/third-party/tbb/test/conformance/conformance_flowgraph.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2022 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -626,28 +626,31 @@ void test_copy_ctor_for_buffering_nodes(Args... node_args) { template void test_priority(Args... node_args) { - std::size_t concurrency_limit = 1; - oneapi::tbb::global_control control(oneapi::tbb::global_control::max_allowed_parallelism, concurrency_limit); - oneapi::tbb::flow::graph g; + oneapi::tbb::flow::graph g; oneapi::tbb::flow::continue_node source(g, dummy_functor()); track_first_id_functor::first_id = -1; track_first_id_functor low_functor(1); track_first_id_functor high_functor(2); + // Due to args... 
we cannot create the nodes inside the lambda with old compilers Node high(g, node_args..., high_functor, oneapi::tbb::flow::node_priority_t(1)); Node low(g, node_args..., low_functor); - make_edge(source, low); - make_edge(source, high); + tbb::task_arena a(1, 1); + a.execute([&] { + g.reset(); // attach to this arena - source.try_put(oneapi::tbb::flow::continue_msg()); + make_edge(source, low); + make_edge(source, high); + source.try_put(oneapi::tbb::flow::continue_msg()); - g.wait_for_all(); + g.wait_for_all(); - CHECK_MESSAGE((track_first_id_functor::first_id == 2), "High priority node should execute first"); + CHECK_MESSAGE((track_first_id_functor::first_id == 2), "High priority node should execute first"); + }); } template diff --git a/third-party/tbb/test/conformance/conformance_global_control.cpp b/third-party/tbb/test/conformance/conformance_global_control.cpp index d4ea1908..578ae780 100644 --- a/third-party/tbb/test/conformance/conformance_global_control.cpp +++ b/third-party/tbb/test/conformance/conformance_global_control.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -64,12 +64,14 @@ void TestStackSizeThreadsControl() { utils::NativeParallelFor( threads, StackSizeRun(threads, &barr1, &barr2) ); } -void RunWorkersLimited(size_t parallelism, bool wait) +void RunWorkersLimited(size_t parallelism) { oneapi::tbb::global_control s(oneapi::tbb::global_control::max_allowed_parallelism, parallelism); - // try both configuration with already sleeping workers and with not yet sleeping - if (wait) - utils::Sleep(10); + // TODO: consider better testing approach + // Sleep is required because after destruction global_control on the previous iteration, + // it recalls the maximum concurrency and excessive worker threads might populate the arena. 
+ // So, we need to wait when arena becomes empty but it is unreliable and might sporadically fail. + utils::Sleep(100); const std::size_t expected_threads = (utils::get_platform_max_threads()==1)? 1 : parallelism; utils::ExactConcurrencyLevel::check(expected_threads); } @@ -90,12 +92,10 @@ void TestWorkersConstraints() } const size_t limit_par = utils::min(max_parallelism, 4U); // check that constrains are really met - for (int wait=0; wait<2; wait++) { - for (size_t num=2; num1; num--) - RunWorkersLimited(num, wait==1); - } + for (size_t num=2; num1; num--) + RunWorkersLimited(num); } struct SetUseRun: utils::NoAssign { @@ -136,6 +136,10 @@ void TestAutoInit() if (max_parallelism > 2) { // after autoinit it's possible to decrease workers number oneapi::tbb::global_control s(oneapi::tbb::global_control::max_allowed_parallelism, max_parallelism-1); + // TODO: consider better testing approach + // Sleep is required because after previous concurrency check, the arena is still populated with workers. + // So, we need to wait when arena becomes empty but it is unreliable and might sporadically fail. + utils::Sleep(100); utils::ExactConcurrencyLevel::check(max_parallelism-1); } } @@ -201,7 +205,16 @@ TEST_CASE("setting stack size") { //! \brief \ref interface \ref requirement TEST_CASE("setting max number of threads") { TestWorkersConstraints(); +} +//! Testing concurrenct setting concurrency +//! \brief \ref interface \ref requirement +TEST_CASE("concurrenct setting concurrency") { TestConcurrentSetUseConcurrency(); +} + +//! Testing auto initialization +//! \brief \ref interface \ref requirement +TEST_CASE("auto initialization") { TestAutoInit(); } @@ -336,7 +349,7 @@ TEST_CASE("simple prolong lifetime 3") { // The test cannot work correctly with statically linked runtime. 
// TODO: investigate a failure in debug with MSVC -#if !_MSC_VER || (defined(_DLL) && !defined(_DEBUG)) +#if (!_MSC_VER || (defined(_DLL) && !defined(_DEBUG))) && !EMSCRIPTEN #include // Overall, the test case is not safe because the dtors might not be called during long jump. diff --git a/third-party/tbb/test/conformance/conformance_parallel_for.cpp b/third-party/tbb/test/conformance/conformance_parallel_for.cpp index 44903f06..463ea526 100644 --- a/third-party/tbb/test/conformance/conformance_parallel_for.cpp +++ b/third-party/tbb/test/conformance/conformance_parallel_for.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -399,7 +399,9 @@ TEST_CASE("Flog test") { Flog(); Flog(); Flog(); +#if !EMSCRIPTEN Flog(); +#endif } //! Testing parallel for with different types and step diff --git a/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp b/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp index ad8ee672..e36a2803 100644 --- a/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp +++ b/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp @@ -102,10 +102,8 @@ class ForEachInvokeItem { void do_action_and_feed(oneapi::tbb::feeder& feeder) const { CHECK_MESSAGE(change_vector.size() % 2 == 0, "incorrect test setup"); std::size_t shift = change_vector.size() / 2; - std::cout << "Process " << real_value << std::endl; ++change_vector[real_value]; if (real_value < shift) { - std::cout << "Add " << real_value + shift << std::endl; feeder.add(ForEachInvokeItem(real_value + shift, change_vector)); } } diff --git a/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp b/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp index cf3aee9b..0214bfd9 100644 --- 
a/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp +++ b/third-party/tbb/test/conformance/conformance_parallel_reduce.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "common/test_invoke.h" #include "../tbb/test_partitioner.h" +#include //! \file conformance_parallel_reduce.cpp //! \brief Test for [algorithms.parallel_reduce algorithms.parallel_deterministic_reduce] specification @@ -56,6 +57,59 @@ struct ReduceBody { } }; +template +class MoveOnlyWrapper { +public: + MoveOnlyWrapper() = default; + MoveOnlyWrapper(const T& obj) : my_obj(obj) {} + + MoveOnlyWrapper(MoveOnlyWrapper&&) = default; + MoveOnlyWrapper& operator=(MoveOnlyWrapper&&) = default; + + MoveOnlyWrapper(const MoveOnlyWrapper&) = delete; + MoveOnlyWrapper& operator=(const MoveOnlyWrapper&) = delete; + + bool operator==(const MoveOnlyWrapper& other) const { return my_obj == other.my_obj; } +private: + T my_obj; +}; // class MoveOnlyWrapper + +// The container wrapper that is copyable but the copy constructor fails if the source container is non-empty +// If such an empty container is provided as an identity into parallel reduce algorithm with rvalue-friendly body, +// it should only call the copy constructor while broadcasting the identity element into the leafs +// and the identity element is an empty container for the further test +template +class EmptyCopyList { +public: + EmptyCopyList() = default; + + EmptyCopyList(EmptyCopyList&&) = default; + EmptyCopyList& operator=(EmptyCopyList&&) = default; + + EmptyCopyList(const EmptyCopyList& other) { + REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list"); + } + EmptyCopyList& operator=(const EmptyCopyList& other) { + REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list"); + return 
*this; + } + + typename std::list::iterator insert(typename std::list::const_iterator pos, T&& item) { + return my_list.insert(pos, std::move(item)); + } + + void splice(typename std::list::const_iterator pos, EmptyCopyList&& other) { + my_list.splice(pos, std::move(other.my_list)); + } + + typename std::list::const_iterator end() const { return my_list.end(); } + + bool operator==(const EmptyCopyList& other) const { return my_list == other.my_list; } + +private: + std::list my_list; +}; // class EmptyCopyList + template void TestDeterministicReductionFor() { const int N = 1000; @@ -174,3 +228,109 @@ TEST_CASE("parallel_[deterministic_]reduce and std::invoke") { } #endif + +template +void test_vector_of_lists_rvalue_reduce_basic(const Runner& runner, PartitionerContext&&... args) { + constexpr std::size_t n_vectors = 10000; + + using inner_type = MoveOnlyWrapper; + using list_type = EmptyCopyList; + using vector_of_lists_type = std::vector; + + vector_of_lists_type vector_of_lists; + + vector_of_lists.reserve(n_vectors); + for (std::size_t i = 0; i < n_vectors; ++i) { + list_type list; + + list.insert(list.end(), inner_type{1}); + list.insert(list.end(), inner_type{2}); + list.insert(list.end(), inner_type{3}); + list.insert(list.end(), inner_type{4}); + list.insert(list.end(), inner_type{5}); + vector_of_lists.emplace_back(std::move(list)); + } + + oneapi::tbb::blocked_range range(0, n_vectors, n_vectors * 2); + + auto reduce_body = [&](const decltype(range)& range_obj, list_type&& x) { + list_type new_list = std::move(x); + + for (std::size_t index = range_obj.begin(); index != range_obj.end(); ++index) { + new_list.splice(new_list.end(), std::move(vector_of_lists[index])); + } + return new_list; + }; + + auto join_body = [&](list_type&& x, list_type&& y) { + list_type new_list = std::move(x); + + new_list.splice(new_list.end(), std::move(y)); + return new_list; + }; + + list_type result = runner(range, list_type{}, reduce_body, join_body, std::forward(args)...); 
+ + list_type expected_result; + + for (std::size_t i = 0; i < n_vectors; ++i) { + expected_result.insert(expected_result.end(), inner_type{1}); + expected_result.insert(expected_result.end(), inner_type{2}); + expected_result.insert(expected_result.end(), inner_type{3}); + expected_result.insert(expected_result.end(), inner_type{4}); + expected_result.insert(expected_result.end(), inner_type{5}); + } + + REQUIRE_MESSAGE(expected_result == result, "Incorrect reduce result"); +} + +struct ReduceRunner { + template + auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_reduce(std::forward(args)...)) { + return oneapi::tbb::parallel_reduce(std::forward(args)...); + } +}; + +struct DeterministicReduceRunner { + template + auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_deterministic_reduce(std::forward(args)...)) { + return oneapi::tbb::parallel_deterministic_reduce(std::forward(args)...); + } +}; + +void test_vector_of_lists_rvalue_reduce() { + ReduceRunner runner; + oneapi::tbb::affinity_partitioner af_partitioner; + oneapi::tbb::task_group_context context; + + test_vector_of_lists_rvalue_reduce_basic(runner); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{}); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}); + test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner); + + test_vector_of_lists_rvalue_reduce_basic(runner, context); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{}, context); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context); + test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner, context); +} + +void test_vector_of_lists_rvalue_deterministic_reduce() { + 
DeterministicReduceRunner runner; + oneapi::tbb::task_group_context context; + + test_vector_of_lists_rvalue_reduce_basic(runner); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}); + + test_vector_of_lists_rvalue_reduce_basic(runner, context); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context); + test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context); +} + +//! \brief \ref interface \ref requirement +TEST_CASE("test rvalue optimization") { + test_vector_of_lists_rvalue_reduce(); + test_vector_of_lists_rvalue_deterministic_reduce(); +} diff --git a/third-party/tbb/test/tbb/test_arena_constraints.cpp b/third-party/tbb/test/tbb/test_arena_constraints.cpp index 9264b870..846bf0fc 100644 --- a/third-party/tbb/test/tbb/test_arena_constraints.cpp +++ b/third-party/tbb/test/tbb/test_arena_constraints.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2019-2022 Intel Corporation + Copyright (c) 2019-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "tbb/parallel_for.h" -#if __TBB_HWLOC_VALID_ENVIRONMENT +#if __TBB_HWLOC_VALID_ENVIRONMENT && __HWLOC_CPUBIND_PRESENT //! Test affinity and default_concurrency correctness for all available constraints. //! 
\brief \ref error_guessing TEST_CASE("Test affinity and default_concurrency correctness for all available constraints.") { @@ -87,7 +87,7 @@ TEST_CASE("Test constraints propagation during arenas copy construction") { test_constraints_affinity_and_concurrency(constraints, copied_affinity); } } -#endif /*__TBB_HWLOC_VALID_ENVIRONMENT*/ +#endif /*__TBB_HWLOC_VALID_ENVIRONMENT && __HWLOC_CPUBIND_PRESENT */ // The test cannot be stabilized with TBB malloc under Thread Sanitizer #if !__TBB_USE_THREAD_SANITIZER diff --git a/third-party/tbb/test/tbb/test_arena_priorities.cpp b/third-party/tbb/test/tbb/test_arena_priorities.cpp index 06e199ce..ba160df8 100644 --- a/third-party/tbb/test/tbb/test_arena_priorities.cpp +++ b/third-party/tbb/test/tbb/test_arena_priorities.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -361,7 +361,6 @@ void test() { // TODO: nested arena case - //! Test for setting a priority to arena //! \brief \ref requirement TEST_CASE("Arena priorities") { diff --git a/third-party/tbb/test/tbb/test_async_node.cpp b/third-party/tbb/test/tbb/test_async_node.cpp index 4ea8a688..edab0c38 100644 --- a/third-party/tbb/test/tbb/test_async_node.cpp +++ b/third-party/tbb/test/tbb/test_async_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -831,9 +831,9 @@ TEST_CASE("Inner enqueuing test"){ } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET -//! Test deprecated follows and preceedes API +//! Test deprecated follows and precedes API //! 
\brief \ref error_guessing -TEST_CASE("Test follows and preceedes API"){ +TEST_CASE("Test follows and precedes API"){ test_follows_and_precedes_api(); } #endif diff --git a/third-party/tbb/test/tbb/test_broadcast_node.cpp b/third-party/tbb/test/tbb/test_broadcast_node.cpp index 8effa862..b3905e6d 100644 --- a/third-party/tbb/test/tbb/test_broadcast_node.cpp +++ b/third-party/tbb/test/tbb/test_broadcast_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -267,9 +267,9 @@ TEST_CASE("Resets"){ } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET -//! Test deprecated follows and preceedes API +//! Test deprecated follows and precedes API //! \brief \ref error_guessing -TEST_CASE("Follows and preceedes API"){ +TEST_CASE("Follows and precedes API"){ test_follows_and_precedes_api(); } #endif diff --git a/third-party/tbb/test/tbb/test_buffer_node.cpp b/third-party/tbb/test/tbb/test_buffer_node.cpp index 6bc2a22a..89f4485b 100644 --- a/third-party/tbb/test/tbb/test_buffer_node.cpp +++ b/third-party/tbb/test/tbb/test_buffer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -480,7 +480,7 @@ TEST_CASE("Resets"){ } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET -//! Test deprecated follows and preceedes API +//! Test deprecated follows and precedes API //! 
\brief \ref error_guessing TEST_CASE("Follows and precedes API"){ test_follows_and_precedes_api(); diff --git a/third-party/tbb/test/tbb/test_collaborative_call_once.cpp b/third-party/tbb/test/tbb/test_collaborative_call_once.cpp index d8ee09fd..11a04a10 100644 --- a/third-party/tbb/test/tbb/test_collaborative_call_once.cpp +++ b/third-party/tbb/test/tbb/test_collaborative_call_once.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2022 Intel Corporation + Copyright (c) 2022-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -206,6 +206,7 @@ TEST_CASE("only calls once - move only argument") { } } +#if !EMSCRIPTEN //! Stress test for functor to be called only once //! \brief \ref interface \ref requirement \ref stress TEST_CASE("only calls once - stress test") { @@ -246,7 +247,7 @@ TEST_CASE("only calls once - stress test") { }); } } - +#endif #if TBB_USE_EXCEPTIONS //! Test for collaborative_call_once exception handling @@ -324,6 +325,7 @@ TEST_CASE("handles exceptions - stress test") { #endif +#if !EMSCRIPTEN //! Test for multiple help from moonlighting threads //! \brief \ref interface \ref requirement TEST_CASE("multiple help") { @@ -341,6 +343,7 @@ TEST_CASE("multiple help") { }); }); } +#endif //! Test for collaborative work from different arenas //! \brief \ref interface \ref requirement diff --git a/third-party/tbb/test/tbb/test_concurrent_hash_map.cpp b/third-party/tbb/test/tbb/test_concurrent_hash_map.cpp index a05782a7..fcef8263 100644 --- a/third-party/tbb/test/tbb/test_concurrent_hash_map.cpp +++ b/third-party/tbb/test/tbb/test_concurrent_hash_map.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -570,13 +570,13 @@ class MinimalisticMutex { bool upgrade_to_writer() const { // upgrade_to_writer should return false if the mutex simulates - // reaquiring the lock on upgrade operation + // reacquiring the lock on upgrade operation return !SimulateReacquiring; } bool downgrade_to_reader() const { // downgrade_to_reader should return false if the mutex simulates - // reaquiring the lock on upgrade operation + // reacquiring the lock on upgrade operation return !SimulateReacquiring; } diff --git a/third-party/tbb/test/tbb/test_concurrent_queue.cpp b/third-party/tbb/test/tbb/test_concurrent_queue.cpp index cfaaf226..18267615 100644 --- a/third-party/tbb/test/tbb/test_concurrent_queue.cpp +++ b/third-party/tbb/test/tbb/test_concurrent_queue.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -219,6 +219,7 @@ struct TrackableItem { auto it = object_addresses.find(this); CHECK(it != object_addresses.end()); object_addresses.erase(it); + CHECK(object_addresses.count(this) == 0); } }; diff --git a/third-party/tbb/test/tbb/test_continue_node.cpp b/third-party/tbb/test/tbb/test_continue_node.cpp index abdc6d1d..8c2c5c5b 100644 --- a/third-party/tbb/test/tbb/test_continue_node.cpp +++ b/third-party/tbb/test/tbb/test_continue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -371,7 +371,7 @@ TEST_CASE("Two graphs") { test_two_graphs(); } TEST_CASE( "Lightweight policy" ) { test_lightweight_policy(); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET -//! Test deprecated follows and preceedes API +//! Test deprecated follows and precedes API //! 
\brief \ref error_guessing TEST_CASE( "Support for follows and precedes API" ) { test_follows_and_precedes_api(); } #endif diff --git a/third-party/tbb/test/tbb/test_eh_algorithms.cpp b/third-party/tbb/test/tbb/test_eh_algorithms.cpp index b5879bbb..7a2b59b4 100644 --- a/third-party/tbb/test/tbb/test_eh_algorithms.cpp +++ b/third-party/tbb/test/tbb/test_eh_algorithms.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,9 +29,9 @@ #include "tbb/parallel_pipeline.h" #include "tbb/blocked_range.h" #include "tbb/task_group.h" -#include "tbb/global_control.h" #include "tbb/concurrent_unordered_map.h" #include "tbb/task.h" +#include "tbb/global_control.h" //! \file test_eh_algorithms.cpp //! \brief Test for [algorithms.parallel_for algorithms.parallel_reduce algorithms.parallel_deterministic_reduce algorithms.parallel_for_each algorithms.parallel_pipeline algorithms.parallel_pipeline.flow_control] specifications @@ -388,18 +388,20 @@ TEST_CASE("parallel_for and parallel_reduce exception handling test #0") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - Test0(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + Test0(); + } + }); } } } - +#if !EMSCRIPTEN //! Testing parallel_for and parallel_reduce exception handling //! 
\brief \ref error_guessing TEST_CASE("parallel_for and parallel_reduce exception handling test #1") { @@ -407,14 +409,16 @@ TEST_CASE("parallel_for and parallel_reduce exception handling test #1") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - Test1(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + Test1(); + } + }); } } } @@ -426,14 +430,16 @@ TEST_CASE("parallel_for and parallel_reduce exception handling test #2") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - Test2(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + Test2(); + } + }); } } } @@ -445,14 +451,16 @@ TEST_CASE("parallel_for and parallel_reduce exception handling test #3") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - Test3(); - } + tbb::task_arena a(g_NumThreads); 
+ a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + Test3(); + } + }); } } } @@ -464,20 +472,22 @@ TEST_CASE("parallel_for and parallel_reduce exception handling test #4") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - Test4(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + Test4(); + } + }); } } } +#endif #endif /* TBB_USE_EXCEPTIONS */ - class ParForBodyToCancel { public: void operator()( const range_type& ) const { @@ -674,18 +684,21 @@ TEST_CASE("parallel_for and parallel_reduce cancellation test #1") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - TestCancelation1(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + TestCancelation1(); + } + }); } } } +#if !EMSCRIPTEN //! Testing parallel_for and parallel_reduce cancellation //! 
\brief \ref error_guessing TEST_CASE("parallel_for and parallel_reduce cancellation test #2") { @@ -693,17 +706,20 @@ TEST_CASE("parallel_for and parallel_reduce cancellation test #2") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - TestCancelation2(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + TestCancelation2(); + } + }); } } } +#endif //! Testing parallel_for and parallel_reduce cancellation //! \brief \ref error_guessing @@ -712,14 +728,16 @@ TEST_CASE("parallel_for and parallel_reduce cancellation test #3") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - TestCancelation3(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + TestCancelation3(); + } + }); } } } @@ -731,14 +749,16 @@ TEST_CASE("parallel_for and parallel_reduce cancellation test #4") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - 
g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - TestCancelation4(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + TestCancelation4(); + } + }); } } } @@ -1015,6 +1035,7 @@ void Test5_parallel_for_each () { } } // void Test5_parallel_for_each () +#if !EMSCRIPTEN //! Testing parallel_for_each exception handling //! \brief \ref error_guessing TEST_CASE("parallel_for_each exception handling test #1") { @@ -1022,17 +1043,20 @@ TEST_CASE("parallel_for_each exception handling test #1") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - RunWithSimpleBody(Test1_parallel_for_each, SimpleParForEachBody); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + RunWithSimpleBody(Test1_parallel_for_each, SimpleParForEachBody); + } + }); } } } +#endif //! Testing parallel_for_each exception handling //! 
\brief \ref error_guessing @@ -1041,18 +1065,21 @@ TEST_CASE("parallel_for_each exception handling test #2") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - RunWithTemplatedBody(Test2_parallel_for_each, OuterParForEachBody); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + RunWithTemplatedBody(Test2_parallel_for_each, OuterParForEachBody); + } + }); } } } +#if !EMSCRIPTEN //! Testing parallel_for_each exception handling //! \brief \ref error_guessing TEST_CASE("parallel_for_each exception handling test #3") { @@ -1060,17 +1087,20 @@ TEST_CASE("parallel_for_each exception handling test #3") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - RunWithTemplatedBody(Test3_parallel_for_each, OuterParForEachBodyWithIsolatedCtx); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + RunWithTemplatedBody(Test3_parallel_for_each, OuterParForEachBodyWithIsolatedCtx); + } + }); } } } +#endif //! Testing parallel_for_each exception handling //! 
\brief \ref error_guessing @@ -1079,18 +1109,21 @@ TEST_CASE("parallel_for_each exception handling test #4") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - RunWithTemplatedBody(Test4_parallel_for_each, OuterParForEachWithEhBody); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + RunWithTemplatedBody(Test4_parallel_for_each, OuterParForEachWithEhBody); + } + }); } } } +#if !EMSCRIPTEN //! Testing parallel_for_each exception handling //! \brief \ref error_guessing TEST_CASE("parallel_for_each exception handling test #5") { @@ -1098,20 +1131,22 @@ TEST_CASE("parallel_for_each exception handling test #5") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - Test5_parallel_for_each >(); - Test5_parallel_for_each >(); - Test5_parallel_for_each >(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + Test5_parallel_for_each >(); + Test5_parallel_for_each >(); + Test5_parallel_for_each >(); + } + }); } } } - +#endif #endif /* TBB_USE_EXCEPTIONS */ class ParForEachBodyToCancel { @@ -1189,6 +1224,7 @@ void TestCancelation2_parallel_for_each () { 
RunCancellationTest, Cancellator2>(); } +#if !EMSCRIPTEN //! Testing parallel_for_each cancellation test //! \brief \ref error_guessing TEST_CASE("parallel_for_each cancellation test #1") { @@ -1196,13 +1232,15 @@ TEST_CASE("parallel_for_each cancellation test #1") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - RunWithSimpleBody(TestCancelation1_parallel_for_each, ParForEachBodyToCancel); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + RunWithSimpleBody(TestCancelation1_parallel_for_each, ParForEachBodyToCancel); + } + }); } } } @@ -1214,17 +1252,20 @@ TEST_CASE("parallel_for_each cancellation test #2") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - - RunWithSimpleBody(TestCancelation2_parallel_for_each, ParForEachBodyToCancel2); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + + RunWithSimpleBody(TestCancelation2_parallel_for_each, ParForEachBodyToCancel2); + } + }); } } } +#endif //////////////////////////////////////////////////////////////////////////////// // Tests for tbb::parallel_pipeline @@ -1555,17 +1596,19 @@ void 
TestWithDifferentFiltersAndConcurrency() { // Execute in all the possible modes for ( size_t j = 0; j < 4; ++j ) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - g_NumTokens = 2 * g_NumThreads; - - for ( int i = 0; i < NumFilterTypes; ++i ) { - for ( int n = 0; n < NumFilterTypes; ++n ) { - for ( int k = 0; k < 2; ++k ) - testFunc( FilterSet(modes[i], modes[n], k == 0, k != 0) ); + tbb::task_arena a(g_NumThreads); + a.execute([&] { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + g_NumTokens = 2 * g_NumThreads; + + for (int i = 0; i < NumFilterTypes; ++i) { + for (int n = 0; n < NumFilterTypes; ++n) { + for (int k = 0; k < 2; ++k) + testFunc(FilterSet(modes[i], modes[n], k == 0, k != 0)); + } } - } + }); } } } @@ -1574,6 +1617,7 @@ void TestWithDifferentFiltersAndConcurrency() { #endif } +#if !EMSCRIPTEN //! Testing parallel_pipeline exception handling //! 
\brief \ref error_guessing TEST_CASE("parallel_pipeline exception handling test #1") { @@ -1597,7 +1641,7 @@ TEST_CASE("parallel_pipeline exception handling test #3") { TEST_CASE("parallel_pipeline exception handling test #4") { TestWithDifferentFiltersAndConcurrency(); } - +#endif #endif /* TBB_USE_EXCEPTIONS */ class FilterToCancel { @@ -1678,19 +1722,22 @@ TEST_CASE("parallel_pipeline cancellation test #1") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - g_NumTokens = 2 * g_NumThreads; - - TestCancelation1_pipeline(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + g_NumTokens = 2 * g_NumThreads; + + TestCancelation1_pipeline(); + } + }); } } } +#if !EMSCRIPTEN //! Testing parallel_pipeline cancellation //! 
\brief \ref error_guessing TEST_CASE("parallel_pipeline cancellation test #2") { @@ -1698,15 +1745,18 @@ TEST_CASE("parallel_pipeline cancellation test #2") { g_NumThreads = static_cast(concurrency_level); g_Master = std::this_thread::get_id(); if (g_NumThreads > 1) { - tbb::global_control control(tbb::global_control::max_allowed_parallelism, g_NumThreads); - // Execute in all the possible modes - for ( size_t j = 0; j < 4; ++j ) { - g_ExceptionInMaster = (j & 1) != 0; - g_SolitaryException = (j & 2) != 0; - g_NumTokens = 2 * g_NumThreads; - - TestCancelation2_pipeline(); - } + tbb::task_arena a(g_NumThreads); + a.execute([] { + // Execute in all the possible modes + for (size_t j = 0; j < 4; ++j) { + g_ExceptionInMaster = (j & 1) != 0; + g_SolitaryException = (j & 2) != 0; + g_NumTokens = 2 * g_NumThreads; + + TestCancelation2_pipeline(); + } + }); } } } +#endif diff --git a/third-party/tbb/test/tbb/test_eh_flow_graph.cpp b/third-party/tbb/test/tbb/test_eh_flow_graph.cpp index ab331551..015d196e 100644 --- a/third-party/tbb/test/tbb/test_eh_flow_graph.cpp +++ b/third-party/tbb/test/tbb/test_eh_flow_graph.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -1928,7 +1928,7 @@ class Foo { } }; -// test from user ahelwer: http://software.intel.com/en-us/forums/showthread.php?t=103786 +// test from user ahelwer: https://community.intel.com/t5/Intel-oneAPI-Threading-Building/Exception-in-flow-graph-results-in-graph-wait-for-all-hanging/td-p/789352 // exception thrown in graph node, not caught in wait_for_all() void test_flow_graph_exception0() { @@ -2017,6 +2017,7 @@ void TestOneThreadNum(int nThread) { ); } +#if !EMSCRIPTEN //! Test exceptions with parallelism //! 
\brief \ref error_guessing TEST_CASE("Testing several threads"){ @@ -2026,5 +2027,5 @@ TEST_CASE("Testing several threads"){ TestOneThreadNum(nThread); } } - +#endif #endif // TBB_USE_EXCEPTIONS diff --git a/third-party/tbb/test/tbb/test_eh_thread.cpp b/third-party/tbb/test/tbb/test_eh_thread.cpp index d5af9db6..a5ac1c8a 100644 --- a/third-party/tbb/test/tbb/test_eh_thread.cpp +++ b/third-party/tbb/test/tbb/test_eh_thread.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2022 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -75,7 +75,7 @@ class Thread { mValid = false; pthread_attr_t attr; // Limit the stack size not to consume all virtual memory on 32 bit platforms. - std::size_t stacksize = utils::max(128*1024, PTHREAD_STACK_MIN); + std::size_t stacksize = utils::max(std::size_t(128*1024), std::size_t(PTHREAD_STACK_MIN)); if (pthread_attr_init(&attr) == 0 && pthread_attr_setstacksize(&attr, stacksize) == 0) { mValid = pthread_create(&mHandle, &attr, thread_routine, /* arg = */ nullptr) == 0; } diff --git a/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp b/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp index 5c798063..483daadb 100644 --- a/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp +++ b/third-party/tbb/test/tbb/test_flow_graph_priorities.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2018-2021 Intel Corporation + Copyright (c) 2018-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -842,6 +842,7 @@ TEST_CASE("Priority nodes take precedence"){ } } +#if !EMSCRIPTEN //! Test thread eager reaction //! \brief \ref error_guessing TEST_CASE("Thread eager reaction"){ @@ -849,6 +850,7 @@ TEST_CASE("Thread eager reaction"){ ThreadsEagerReaction::test( static_cast(p) ); } } +#endif //! 
Test prioritization under concurrency limits //! \brief \ref error_guessing @@ -888,3 +890,4 @@ TEST_CASE("Exceptions") { Exceptions::test(); } #endif + diff --git a/third-party/tbb/test/tbb/test_fuzzing.cpp b/third-party/tbb/test/tbb/test_fuzzing.cpp new file mode 100644 index 00000000..38cd7f8a --- /dev/null +++ b/third-party/tbb/test/tbb/test_fuzzing.cpp @@ -0,0 +1,41 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +//! \file test_fuzzing.cpp +//! \brief Test the [internal] of environment variables + +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + FuzzedDataProvider provider(data, size); + for (auto var : {"INTEL_ITTNOTIFY_GROUPS", "INTEL_LIBITTNOTIFY32", + "INTEL_LIBITTNOTIFY64", "KMP_FOR_TCHECK", "KMP_FOR_TPROFILE", + "TBB_ENABLE_SANITIZERS", "TBB_MALLOC_DISABLE_REPLACEMENT", + "TBB_MALLOC_SET_HUGE_SIZE_THRESHOLD", + "TBB_MALLOC_USE_HUGE_PAGES", "TBB_VERSION"}) { + std::string val = provider.ConsumeRandomLengthString(); +#if _WIN32 + _putenv_s(var, val.c_str()); +#else + setenv(var, val.c_str(), 1); +#endif + } + + if (std::system(CMD) != 0) + __builtin_trap(); + + return 0; +} diff --git a/third-party/tbb/test/tbb/test_global_control.cpp b/third-party/tbb/test/tbb/test_global_control.cpp index 23bdc243..fddbbaf6 100644 --- a/third-party/tbb/test/tbb/test_global_control.cpp +++ b/third-party/tbb/test/tbb/test_global_control.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel 
Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -237,7 +237,7 @@ TEST_CASE("prolong lifetime auto init") { //! Testing lifetime control advanced //! \brief \ref error_guessing TEST_CASE("prolong lifetime advanced") { - // Exceptions test leaves auto-initialized sheduler after, + // Exceptions test leaves auto-initialized scheduler after, // because all blocking terminate calls are inside the parallel region, // thus resulting in false termination result. utils::NativeParallelFor(1, @@ -245,9 +245,30 @@ TEST_CASE("prolong lifetime advanced") { } #endif +#if !EMSCRIPTEN //! Testing multiple wait //! \brief \ref error_guessing TEST_CASE("prolong lifetime multiple wait") { TestBlockingTerminateNS::TestMultpleWait(); } +#endif + +//! \brief \ref regression +TEST_CASE("test concurrent task_scheduler_handle destruction") { + std::atomic stop{ false }; + std::thread thr1([&] { + while (!stop) { + auto h = tbb::task_scheduler_handle{ tbb::attach{} }; + tbb::finalize(h, std::nothrow_t{}); + } + }); + for (int i = 0; i < 1000; ++i) { + std::thread thr2([] { + tbb::parallel_for(0, 1, [](int) {}); + }); + thr2.join(); + } + stop = true; + thr1.join(); +} diff --git a/third-party/tbb/test/tbb/test_join_node_preview.cpp b/third-party/tbb/test/tbb/test_join_node_preview.cpp index ced50bb9..4bcb1900 100644 --- a/third-party/tbb/test/tbb/test_join_node_preview.cpp +++ b/third-party/tbb/test/tbb/test_join_node_preview.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2022 Intel Corporation + Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -84,7 +84,7 @@ void test_follows_and_precedes_api() { //! Test follows and precedes API //! 
\brief \ref error_guessing -TEST_CASE("Test follows and preceedes API"){ +TEST_CASE("Test follows and precedes API"){ test_follows_and_precedes_api(); } diff --git a/third-party/tbb/test/tbb/test_limiter_node.cpp b/third-party/tbb/test/tbb/test_limiter_node.cpp index 7743a377..897f840d 100644 --- a/third-party/tbb/test/tbb/test_limiter_node.cpp +++ b/third-party/tbb/test/tbb/test_limiter_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -222,7 +222,7 @@ int test_serial() { return 0; } -// reported bug in limiter (http://software.intel.com/en-us/comment/1752355) +// reported bug in limiter (https://community.intel.com/t5/Intel-oneAPI-Threading-Building/multifun-node-try-put-several-messages-to-one-successor-crashes/m-p/922844) #define DECREMENT_OUTPUT 1 // the port number of the decrement output of the multifunction_node #define LIMITER_OUTPUT 0 // port number of the integer output diff --git a/third-party/tbb/test/tbb/test_mutex.cpp b/third-party/tbb/test/tbb/test_mutex.cpp index bc7b79e3..5b78f173 100644 --- a/third-party/tbb/test/tbb/test_mutex.cpp +++ b/third-party/tbb/test/tbb/test_mutex.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -109,7 +109,6 @@ void TestTransaction(const char* name) REQUIRE_MESSAGE(n_transactions_attempted.load(std::memory_order_relaxed), "ERROR for " << name << ": transactions were never attempted"); } - //! \brief \ref error_guessing TEST_CASE("Transaction test") { if (have_TSX()) { @@ -119,6 +118,7 @@ TEST_CASE("Transaction test") { } #endif /* __TBB_TSX_TESTING_ENABLED_FOR_THIS_COMPILER */ + //! 
\brief \ref error_guessing TEST_CASE("test upgrade/downgrade with spin_rw_mutex") { test_rwm_upgrade_downgrade(); @@ -144,10 +144,12 @@ TEST_CASE("test spin_mutex with native threads") { test_with_native_threads::test(); } +#if !EMSCRIPTEN //! \brief \ref error_guessing TEST_CASE("test queuing_mutex with native threads") { test_with_native_threads::test(); } +#endif //! \brief \ref error_guessing TEST_CASE("test mutex with native threads") { @@ -160,11 +162,13 @@ TEST_CASE("test spin_rw_mutex with native threads") { test_with_native_threads::test_rw(); } +#if !EMSCRIPTEN //! \brief \ref error_guessing TEST_CASE("test queuing_rw_mutex with native threads") { test_with_native_threads::test(); test_with_native_threads::test_rw(); } +#endif //! \brief \ref error_guessing TEST_CASE("test rw_mutex with native threads") { @@ -197,3 +201,4 @@ TEST_CASE("internal mutex concepts") { tbb::null_rw_mutex, tbb::queuing_rw_mutex>); } #endif // __TBB_CPP20_CONCEPTS_PRESENT + diff --git a/third-party/tbb/test/tbb/test_parallel_for_each.cpp b/third-party/tbb/test/tbb/test_parallel_for_each.cpp index f6bb5090..3dfc107e 100644 --- a/third-party/tbb/test/tbb/test_parallel_for_each.cpp +++ b/third-party/tbb/test/tbb/test_parallel_for_each.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,6 +22,105 @@ //! \file test_parallel_for_each.cpp //! \brief Test for [algorithms.parallel_for_each] +#if __TBB_CPP20_PRESENT +// Fancy iterator type that models the C++20 iterator type +// that defines the real iterator category using iterator_concept type +// and iterator_category is always std::input_iterator_type +// Similar iterators are used by C++20 ranges (e.g. 
std::ranges::iota_view::iterator) +// parallel_for_each algorithm should detect such iterators with respect to iterator_concept value + +template +struct cpp20_iterator { + static_assert(std::derived_from, + "cpp20_iterator should be of at least forward iterator category"); + + using iterator_concept = Category; + using iterator_category = std::input_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + + cpp20_iterator() = default; + explicit cpp20_iterator(T* ptr) : my_ptr(ptr) {} + + T& operator*() const { return *my_ptr; } + + cpp20_iterator& operator++() { + ++my_ptr; + return *this; + } + + cpp20_iterator operator++(int) { + auto it = *this; + ++*this; + return it; + } + + cpp20_iterator& operator--() + requires std::derived_from + { + --my_ptr; + return *this; + } + + cpp20_iterator operator--(int) + requires std::derived_from + { + auto it = *this; + --*this; + return it; + } + + cpp20_iterator& operator+=(difference_type n) + requires std::derived_from + { + my_ptr += n; + return *this; + } + + cpp20_iterator& operator-=(difference_type n) + requires std::derived_from + { + my_ptr -= n; + return *this; + } + + T& operator[](difference_type n) const + requires std::derived_from + { + return my_ptr[n]; + } + + friend bool operator==(const cpp20_iterator&, const cpp20_iterator&) = default; + + friend auto operator<=>(const cpp20_iterator&, const cpp20_iterator&) + requires std::derived_from = default; + + friend cpp20_iterator operator+(cpp20_iterator i, difference_type n) + requires std::derived_from + { + return cpp20_iterator(i.my_ptr + n); + } + + friend cpp20_iterator operator+(difference_type n, cpp20_iterator i) + requires std::derived_from + { + return i + n; + } + + friend cpp20_iterator operator-(cpp20_iterator i, difference_type n) + requires std::derived_from + { + return cpp20_iterator(i.my_ptr - n); + } + + friend difference_type operator-(const cpp20_iterator& x, const cpp20_iterator& y) { + return x.my_ptr - 
y.my_ptr; + } +private: + T* my_ptr = nullptr; +}; // class cpp20_iterator +#endif // __TBB_CPP20_PRESENT + //! Test forward access iterator support //! \brief \ref error_guessing \ref interface TEST_CASE("Forward iterator support") { @@ -172,3 +271,65 @@ TEST_CASE("parallel_for_each constraints") { } #endif // __TBB_CPP20_CONCEPTS_PRESENT + +#if __TBB_CPP20_PRESENT + +struct no_copy_move { + no_copy_move() = default; + + no_copy_move(const no_copy_move&) = delete; + no_copy_move(no_copy_move&&) = delete; + + no_copy_move& operator=(const no_copy_move&) = delete; + no_copy_move& operator=(no_copy_move&&) = delete; + + int item = 0; +}; + +template +void test_with_cpp20_iterator() { + constexpr std::size_t n = 1'000'000; + + std::vector elements(n); + + cpp20_iterator begin(elements.data()); + cpp20_iterator end(elements.data() + n); + + oneapi::tbb::parallel_for_each(begin, end, [](no_copy_move& element) { + element.item = 42; + }); + + for (std::size_t index = 0; index < n; ++index) { + CHECK(elements[index].item == 42); + } +} + +//! 
\brief \ref error_guessing \ref regression +TEST_CASE("parallel_for_each with cpp20 iterator") { + // Test that parallel_for_each threats ignores iterator_category type + // if iterator_concept type is defined for iterator + + // For input iterators parallel_for_each requires element to be + // copyable or movable so since cpp20_iterator is at least forward + // parallel_for_each should work with cpp20_iterator + // on non-copyable and non-movable type + + // test cpp20_iterator implementation + using cpp20_forward_iterator = cpp20_iterator; + using cpp20_bidirectional_iterator = cpp20_iterator; + using cpp20_random_access_iterator = cpp20_iterator; + + static_assert(std::forward_iterator); + static_assert(!std::bidirectional_iterator); + + static_assert(std::bidirectional_iterator); + static_assert(!std::random_access_iterator); + + static_assert(std::random_access_iterator); + + test_with_cpp20_iterator(); + test_with_cpp20_iterator(); + test_with_cpp20_iterator(); +} + +#endif // __TBB_CPP20_PRESENT diff --git a/third-party/tbb/test/tbb/test_parallel_invoke.cpp b/third-party/tbb/test/tbb/test_parallel_invoke.cpp index f83fcc8b..6c94e3bc 100644 --- a/third-party/tbb/test/tbb/test_parallel_invoke.cpp +++ b/third-party/tbb/test/tbb/test_parallel_invoke.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -27,6 +27,8 @@ //! \file test_parallel_invoke.cpp //! \brief Test for [algorithms.parallel_invoke] +#if !EMSCRIPTEN +//! Emscripten requires preloading of the file used to determine memory usage, hence disabled. //! Testing parallel_invoke memory usage //! 
\brief \ref resource_usage \ref stress TEST_CASE("Test memory leaks") { @@ -56,6 +58,7 @@ TEST_CASE("Test memory leaks") { } REQUIRE_MESSAGE(false, "Seems like we get memory leak here."); } +#endif template void test_from_2_to_10_arguments(const Body& body, const std::atomic& counter) { diff --git a/third-party/tbb/test/tbb/test_partitioner.cpp b/third-party/tbb/test/tbb/test_partitioner.cpp index b78fe208..9af5009d 100644 --- a/third-party/tbb/test/tbb/test_partitioner.cpp +++ b/third-party/tbb/test/tbb/test_partitioner.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2021-2022 Intel Corporation + Copyright (c) 2021-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -44,6 +44,9 @@ template float test(PerBodyFunc&& body) { #if __TBB_USE_THREAD_SANITIZER // Reduce execution time under Thread Sanitizer const std::size_t repeats = 50; +#elif EMSCRIPTEN + // Reduce execution time for emscripten + const std::size_t repeats = 10; #else const std::size_t repeats = 100; #endif diff --git a/third-party/tbb/test/tbb/test_resumable_tasks.cpp b/third-party/tbb/test/tbb/test_resumable_tasks.cpp index a363a9ca..0cba9772 100644 --- a/third-party/tbb/test/tbb/test_resumable_tasks.cpp +++ b/third-party/tbb/test/tbb/test_resumable_tasks.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -423,6 +423,7 @@ class TestCaseGuard { thread_local bool TestCaseGuard::m_local = false; +#if !EMSCRIPTEN //! Nested test for suspend and resume //! \brief \ref error_guessing TEST_CASE("Nested test for suspend and resume") { @@ -436,6 +437,7 @@ TEST_CASE("Nested arena") { TestCaseGuard guard; TestNestedArena(); } +#endif //! Test with external threads //! 
\brief \ref error_guessing @@ -443,11 +445,13 @@ TEST_CASE("External threads") { TestNativeThread(); } +#if !EMSCRIPTEN //! Stress test with external threads //! \brief \ref stress TEST_CASE("Stress test with external threads") { TestCleanupMaster(); } +#endif //! Test with an arena observer //! \brief \ref error_guessing diff --git a/third-party/tbb/test/tbb/test_scheduler_mix.cpp b/third-party/tbb/test/tbb/test_scheduler_mix.cpp index c2c02bb7..8d8e0e37 100644 --- a/third-party/tbb/test/tbb/test_scheduler_mix.cpp +++ b/third-party/tbb/test/tbb/test_scheduler_mix.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2021-2022 Intel Corporation + Copyright (c) 2021-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -522,7 +522,7 @@ enum ACTIONS { num_actions }; -void global_actor(); +void global_actor(size_t arenaAfterStealing); template struct actor; @@ -543,8 +543,13 @@ struct actor { template <> struct actor { - static void do_it(Random& r) { + static void do_it(Random& r, size_t arenaAfterStealing) { static thread_local std::size_t arenaLevel = 0; + + // treat arenas index as priority: we own some resource already, + // so may pretend only to low-priority resource + arenaLevel = std::max(arenaLevel, arenaAfterStealing); + ArenaTable::ScopedLock lock; auto entry = arenaTable.acquire(r, lock); if (entry.first) { @@ -561,11 +566,13 @@ struct actor { tbb::this_task_arena::enqueue([&wctx] { wctx.release(); }); tbb::detail::d1::wait(wctx, ctx); } else { - global_actor(); + global_actor(0); } }; switch (r.get() % (16*num_arena_actions)) { case arena_execute: + // to prevent deadlock, potentially blocking operation + // may be called only for arenas with larger index if (entry.second > arenaLevel) { gStats.notify(Statistics::ArenaExecute); auto oldArenaLevel = arenaLevel; @@ -579,7 +586,9 @@ struct actor { utils_fallthrough; default: gStats.notify(Statistics::ArenaEnqueue); - 
entry.first->enqueue([] { global_actor(); }); + // after stealing by a worker, the task will run in arena + // with index entry.second + entry.first->enqueue([ entry ] { global_actor(entry.second); }); break; } arenaTable.release(lock); @@ -601,7 +610,7 @@ struct actor { auto doGlbAction = rnd.get() % 1000 == 42; auto body = [doGlbAction, sz](int i) { if (i == sz / 2 && doGlbAction) { - global_actor(); + global_actor(0); } }; @@ -621,7 +630,7 @@ struct actor { } }; -void global_actor() { +void global_actor(size_t arenaAfterStealing) { static thread_local std::uint64_t localNumActions{}; while (globalNumActions < maxNumActions) { @@ -629,7 +638,7 @@ void global_actor() { switch (rnd.get() % num_actions) { case arena_create: gStats.notify(Statistics::ArenaCreate); actor::do_it(rnd); break; case arena_destroy: gStats.notify(Statistics::ArenaDestroy); actor::do_it(rnd); break; - case arena_action: gStats.notify(Statistics::ArenaAcquire); actor::do_it(rnd); break; + case arena_action: gStats.notify(Statistics::ArenaAcquire); actor::do_it(rnd, arenaAfterStealing); break; case parallel_algorithm: gStats.notify(Statistics::ParallelAlgorithm); actor::do_it(rnd); break; } @@ -656,7 +665,7 @@ TEST_CASE("Stress test with mixing functionality") { utils::SpinBarrier startBarrier{numExtraThreads}; utils::NativeParallelFor(numExtraThreads, [&startBarrier](std::size_t) { startBarrier.wait(); - global_actor(); + global_actor(0); }); arenaTable.shutdown(); diff --git a/third-party/tbb/test/tbb/test_task.cpp b/third-party/tbb/test/tbb/test_task.cpp index 72f111f2..876e3510 100644 --- a/third-party/tbb/test/tbb/test_task.cpp +++ b/third-party/tbb/test/tbb/test_task.cpp @@ -771,7 +771,8 @@ TEST_CASE("Test with priority inversion") { auto high_priority_thread_func = [&] { // Increase external threads priority - utils::increase_thread_priority(); + utils::increased_priority_guard guard{}; + utils::suppress_unused_warning(guard); // pin external threads test_arena.execute([]{}); while 
(task_counter++ < critical_task_counter) { @@ -796,7 +797,8 @@ TEST_CASE("Test with priority inversion") { high_priority_threads.emplace_back(high_priority_thread_func); } - utils::increase_thread_priority(); + utils::increased_priority_guard guard{}; + utils::suppress_unused_warning(guard); while (task_counter++ < critical_task_counter) { submit(critical_task, test_arena, test_context, true); std::this_thread::sleep_for(std::chrono::milliseconds(1)); @@ -822,3 +824,19 @@ TEST_CASE("raii_guard move ctor") { tbb::detail::d0::raii_guard guard1(func); tbb::detail::d0::raii_guard guard2(std::move(guard1)); } + +//! \brief \ref error_guessing +TEST_CASE("Check correct arena destruction with enqueue") { + for (int i = 0; i < 100; ++i) { + tbb::task_scheduler_handle handle{ tbb::attach{} }; + { + tbb::task_arena a(2, 0); + + a.enqueue([] { + tbb::parallel_for(0, 100, [] (int) { std::this_thread::sleep_for(std::chrono::nanoseconds(10)); }); + }); + std::this_thread::sleep_for(std::chrono::microseconds(1)); + } + tbb::finalize(handle, std::nothrow_t{}); + } +} diff --git a/third-party/tbb/test/tbb/test_task_arena.cpp b/third-party/tbb/test/tbb/test_task_arena.cpp index a00ef43f..fd930f19 100644 --- a/third-party/tbb/test/tbb/test_task_arena.cpp +++ b/third-party/tbb/test/tbb/test_task_arena.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #define __TBB_EXTRA_DEBUG 1 #include "common/concurrency_tracker.h" +#include "common/cpu_usertime.h" #include "common/spin_barrier.h" #include "common/utils.h" #include "common/utils_report.h" @@ -33,12 +34,13 @@ #include "tbb/spin_rw_mutex.h" #include "tbb/task_group.h" -#include -#include +#include +#include #include -#include +#include +#include #include -#include +#include //#include "harness_fp.h" @@ -446,10 +448,10 @@ class TestArenaConcurrencyBody : utils::NoAssign { // Arena's functor void operator()() const { int idx = tbb::this_task_arena::current_thread_index(); - CHECK( idx < (my_max_concurrency > 1 ? my_max_concurrency : 2) ); - CHECK( my_a.max_concurrency() == tbb::this_task_arena::max_concurrency() ); + REQUIRE( idx < (my_max_concurrency > 1 ? my_max_concurrency : 2) ); + REQUIRE( my_a.max_concurrency() == tbb::this_task_arena::max_concurrency() ); int max_arena_concurrency = tbb::this_task_arena::max_concurrency(); - CHECK( max_arena_concurrency == my_max_concurrency ); + REQUIRE( max_arena_concurrency == my_max_concurrency ); if ( my_worker_barrier ) { if ( local_id.local() == 1 ) { // External thread in a reserved slot @@ -748,7 +750,7 @@ namespace TestIsolatedExecuteNS { for ( int i = 0; i <= max_repeats; ++i ) { OuterParFor( outer_isolation, is_stolen )(); } - REQUIRE_MESSAGE( !is_stolen, "isolate() on nested levels should prevent stealing from outer leves" ); + REQUIRE_MESSAGE( !is_stolen, "isolate() on nested levels should prevent stealing from outer levels" ); } } @@ -1123,7 +1125,7 @@ void TestMultipleWaits( int num_threads, int num_bunches, int bunch_size ) { for ( int repeats = 0; repeats<10; ++repeats ) { int idx = 0; for ( int bunch = 0; bunch < num_bunches-1; ++bunch ) { - // Sync with the previous bunch of waiters to prevent "false" nested dependicies (when a nested task waits for an outer task). 
+ // Sync with the previous bunch of waiters to prevent "false" nested dependencies (when a nested task waits for an outer task). while ( processed < bunch*bunch_size ) utils::yield(); // Run the bunch of threads/waiters that depend on the next bunch of threads/waiters. for ( int i = 0; i& my_task_counter; }; +void test_threads_sleep(int concurrency, int reserved_slots, int num_external_threads) { + tbb::task_arena a(concurrency, reserved_slots); + std::mutex m; + std::condition_variable cond_var; + bool completed{ false }; + utils::SpinBarrier barrier( concurrency - reserved_slots + 1 ); + + auto body = [&] { + std::unique_lock lock(m); + cond_var.wait(lock, [&] { return completed == true; }); + }; + + for (int i = 0; i < concurrency - reserved_slots; ++i) { + a.enqueue([&] { + body(); + barrier.signalNoWait(); + }); + } + std::vector threads; + for (int i = 0; i < num_external_threads; ++i) { + threads.emplace_back([&]() { a.execute(body); }); + } + TestCPUUserTime(concurrency); + + { + std::lock_guard lock(m); + completed = true; + cond_var.notify_all(); + } + for (auto& t : threads) { + t.join(); + } + barrier.wait(); +} + +void test_threads_sleep(int concurrency, int reserved_slots) { + test_threads_sleep(concurrency, reserved_slots, reserved_slots); + test_threads_sleep(concurrency, reserved_slots, 2 * concurrency); +} + //--------------------------------------------------// // This test requires TBB in an uninitialized state @@ -1789,11 +1831,14 @@ TEST_CASE("Test for concurrent functionality") { TestConcurrentFunctionality(); } +#if !EMSCRIPTEN +//! For emscripten, FPU control state has not been set correctly //! Test for arena entry consistency //! \brief \ref requirement \ref error_guessing TEST_CASE("Test for task arena entry consistency") { TestArenaEntryConsistency(); } +#endif //! Test for task arena attach functionality //! 
\brief \ref requirement \ref interface @@ -1825,6 +1870,8 @@ TEST_CASE("Delegated spawn wait") { TestDelegatedSpawnWait(); } +#if !EMSCRIPTEN +//! For emscripten, FPU control state has not been set correctly //! Test task arena isolation functionality //! \brief \ref requirement \ref interface TEST_CASE("Isolated execute") { @@ -1833,6 +1880,7 @@ TEST_CASE("Isolated execute") { TestIsolatedExecute(); } } +#endif //! Test for TBB Workers creation limits //! \brief \ref requirement @@ -1846,11 +1894,14 @@ TEST_CASE("Arena workers migration") { TestArenaWorkersMigration(); } +#if !EMSCRIPTEN +//! For emscripten, FPU control state has not been set correctly //! Test for multiple waits, threads should not block each other //! \brief \ref requirement TEST_CASE("Multiple waits") { TestMultipleWaits(); } +#endif //! Test for small stack size settings and arena initialization //! \brief \ref error_guessing @@ -1884,10 +1935,12 @@ TEST_CASE("Exception thrown during tbb::task_arena::execute call") { }(), std::exception ); } #endif // TBB_USE_EXCEPTIONS + //! \brief \ref stress TEST_CASE("Stress test with mixing functionality") { StressTestMixFunctionality(); } + //! \brief \ref stress TEST_CASE("Workers oversubscription") { std::size_t num_threads = utils::get_platform_max_threads(); @@ -1924,6 +1977,7 @@ TEST_CASE("Workers oversubscription") { ); }); } + #if TBB_USE_EXCEPTIONS //! The test for error in scheduling empty task_handle //! \brief \ref requirement @@ -1938,6 +1992,20 @@ TEST_CASE("Empty task_handle cannot be scheduled" } #endif +#if !EMSCRIPTEN +//! For emscripten, FPU control state has not been set correctly +//! \brief \ref error_guessing +TEST_CASE("Test threads sleep") { + for (auto concurrency_level : utils::concurrency_range()) { + int conc = int(concurrency_level); + test_threads_sleep(conc, 0); + test_threads_sleep(conc, 1); + test_threads_sleep(conc, conc/2); + test_threads_sleep(conc, conc); + } +} +#endif + #if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS //! 
Basic test for is_inside_task in task_group @@ -1979,3 +2047,21 @@ TEST_CASE("is_inside_task in arena::execute") { }); } #endif //__TBB_PREVIEW_TASK_GROUP_EXTENSIONS + +//! \brief \ref interface \ref requirement \ref regression +TEST_CASE("worker threads occupy slots in correct range") { + std::vector arenas(42); + for (auto& arena : arenas) { + arena.initialize(1, 0); + } + + std::atomic counter{0}; + for (auto& arena : arenas) { + arena.enqueue([&] { + CHECK(tbb::this_task_arena::current_thread_index() == 0); + ++counter; + }); + } + + while (counter < 42) { utils::yield(); } +} diff --git a/third-party/tbb/test/tbb/test_task_group.cpp b/third-party/tbb/test/tbb/test_task_group.cpp index e1052b9c..d39b4fc7 100644 --- a/third-party/tbb/test/tbb/test_task_group.cpp +++ b/third-party/tbb/test/tbb/test_task_group.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -99,8 +99,10 @@ class SharedGroupBodyImpl : utils::NoCopy, utils::NoAfterlife { utils::ConcurrencyTracker ct; m_taskGroup->wait(); } - if ( utils::ConcurrencyTracker::PeakParallelism() == 1 ) - WARN( "Warning: No parallel waiting detected in TestParallelWait" ); + if ( utils::ConcurrencyTracker::PeakParallelism() == 1 ) { + const char* msg = "Warning: No parallel waiting detected in TestParallelWait"; + WARN( msg ); + } m_barrier.wait(); } else @@ -753,7 +755,8 @@ namespace TestIsolationNS { //! Test for thread safety for the task_group //! \brief \ref error_guessing \ref resource_usage TEST_CASE("Memory leaks test is not applicable under ASAN\n" * doctest::skip(true)) {} -#else +#elif !EMSCRIPTEN +//! Emscripten requires preloading of the file used to determine memory usage, hence disabled. //! Test for thread safety for the task_group //! 
\brief \ref error_guessing \ref resource_usage TEST_CASE("Thread safety test for the task group") { @@ -821,7 +824,7 @@ TEST_CASE("Move semantics test for the task group") { //! Test for thread safety for the isolated_task_group //! \brief \ref error_guessing TEST_CASE("Memory leaks test is not applicable under ASAN\n" * doctest::skip(true)) {} -#else +#elif !EMSCRIPTEN //! Test for thread safety for the isolated_task_group //! \brief \ref error_guessing TEST_CASE("Thread safety test for the isolated task group") { @@ -925,58 +928,60 @@ TEST_CASE("Test for stack overflow avoidance mechanism within arena") { return; } - tbb::global_control thread_limit(tbb::global_control::max_allowed_parallelism, 2); - tbb::task_group tg1; - tbb::task_group tg2; - std::atomic tasks_executed{}; - - // Determine nested task execution limit. - int second_thread_executed{}; - tg1.run_and_wait([&tg1, &tg2, &tasks_executed, &second_thread_executed] { - run_deep_stealing(tg1, tg2, 10000, tasks_executed); - do { - second_thread_executed = tasks_executed; - utils::Sleep(10); - } while (second_thread_executed < 100 || second_thread_executed != tasks_executed); - CHECK(tasks_executed < 10000); - }); - tg2.wait(); - CHECK(tasks_executed == 10000); + tbb::task_arena a1(2, 1); + a1.execute([] { + tbb::task_group tg1; + tbb::task_group tg2; + std::atomic tasks_executed{}; - tasks_executed = 0; - tbb::task_arena a(2, 2); - tg1.run_and_wait([&a, &tg1, &tg2, &tasks_executed, second_thread_executed] { - run_deep_stealing(tg1, tg2, second_thread_executed-1, tasks_executed); - while (tasks_executed < second_thread_executed-1) { - // Wait until the second thread near the limit. - utils::yield(); - } - tg2.run([&a, &tg1, &tasks_executed] { - a.execute([&tg1, &tasks_executed] { - volatile char consume_stack[1000]{}; - ++tasks_executed; - tg1.wait(); - utils::suppress_unused_warning(consume_stack); - }); - }); - while (tasks_executed < second_thread_executed) { - // Wait until the second joins the arena. 
- utils::yield(); - } - a.execute([&tg1, &tg2, &tasks_executed] { + // Determine nested task execution limit. + int second_thread_executed{}; + tg1.run_and_wait([&tg1, &tg2, &tasks_executed, &second_thread_executed] { run_deep_stealing(tg1, tg2, 10000, tasks_executed); + do { + second_thread_executed = tasks_executed; + utils::Sleep(10); + } while (second_thread_executed < 100 || second_thread_executed != tasks_executed); + CHECK(tasks_executed < 10000); }); - int currently_executed{}; - do { - currently_executed = tasks_executed; - utils::Sleep(10); - } while (currently_executed != tasks_executed); - CHECK(tasks_executed < 10000 + second_thread_executed); - }); - a.execute([&tg2] { tg2.wait(); + CHECK(tasks_executed == 10000); + + tasks_executed = 0; + tbb::task_arena a2(2, 2); + tg1.run_and_wait([&a2, &tg1, &tg2, &tasks_executed, second_thread_executed] { + run_deep_stealing(tg1, tg2, second_thread_executed - 1, tasks_executed); + while (tasks_executed < second_thread_executed - 1) { + // Wait until the second thread near the limit. + utils::yield(); + } + tg2.run([&a2, &tg1, &tasks_executed] { + a2.execute([&tg1, &tasks_executed] { + volatile char consume_stack[1000]{}; + ++tasks_executed; + tg1.wait(); + utils::suppress_unused_warning(consume_stack); + }); + }); + while (tasks_executed < second_thread_executed) { + // Wait until the second joins the arena. + utils::yield(); + } + a2.execute([&tg1, &tg2, &tasks_executed] { + run_deep_stealing(tg1, tg2, 10000, tasks_executed); + }); + int currently_executed{}; + do { + currently_executed = tasks_executed; + utils::Sleep(10); + } while (currently_executed != tasks_executed); + CHECK(tasks_executed < 10000 + second_thread_executed); + }); + a2.execute([&tg2] { + tg2.wait(); + }); + CHECK(tasks_executed == 10000 + second_thread_executed); }); - CHECK(tasks_executed == 10000 + second_thread_executed); } //! Test checks that we can submit work to task_group asynchronously with waiting. 
diff --git a/third-party/tbb/test/tbb/test_tbb_header.cpp b/third-party/tbb/test/tbb/test_tbb_header.cpp index 3270d5ed..a671de1c 100644 --- a/third-party/tbb/test/tbb/test_tbb_header.cpp +++ b/third-party/tbb/test/tbb/test_tbb_header.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -227,6 +227,8 @@ static void DefinitionPresence() { TestTypeDefinitionPresence( queuing_rw_mutex ); TestTypeDefinitionPresence( spin_mutex ); TestTypeDefinitionPresence( spin_rw_mutex ); + TestTypeDefinitionPresence( mutex ); + TestTypeDefinitionPresence( rw_mutex ); TestTypeDefinitionPresence( speculative_spin_mutex ); TestTypeDefinitionPresence( speculative_spin_rw_mutex ); TestTypeDefinitionPresence( task_group_context ); diff --git a/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp b/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp index 224e2476..1a85ed58 100644 --- a/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp +++ b/third-party/tbb/test/tbbmalloc/test_malloc_compliance.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,10 @@ #include "oneapi/tbb/detail/_config.h" +// There is no RLIMIT_AS on OpenBSD. +// Therefore, the tests for memory limit is unreasonable. 
+#if !__OpenBSD__ + #define __TBB_NO_IMPLICIT_LINKAGE 1 #include "tbb/scalable_allocator.h" @@ -1091,3 +1095,4 @@ TEST_CASE("MAIN TEST") { } #endif /* __TBB_WIN8UI_SUPPORT */ +#endif /* Enable test */ diff --git a/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp b/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp index 78a7a3ac..9de151e0 100644 --- a/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp +++ b/third-party/tbb/test/tbbmalloc/test_malloc_whitebox.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -459,7 +459,7 @@ void TestPools() { passBackendSz Byte objects are cached in LOC, but bypassed the backend, so memory requested directly from allocation callback. nextPassBackendSz Byte objects must fit to another LOC bin, - so that their allocation/realeasing leads to cache cleanup. + so that their allocation/releasing leads to cache cleanup. All this is expecting to lead to releasing of passBackendSz Byte object from LOC during LOC cleanup, and putMallocMem checks that returned size is correct. 
@@ -1257,7 +1257,11 @@ void TestTHP() { scalable_allocation_mode(USE_HUGE_PAGES, 1); REQUIRE_MESSAGE(hugePages.isEnabled, "Huge pages should be enabled via scalable_allocation_mode"); +#if defined __loongarch64 + const int HUGE_PAGE_SIZE = 32 * 1024 * 1024; +#else const int HUGE_PAGE_SIZE = 2 * 1024 * 1024; +#endif // allocCount transparent huge pages should be allocated const int allocCount = 10; @@ -1362,7 +1366,7 @@ void TestReallocDecreasing() { reallocPtr = reallocAndRetrieve(origPtr, reallocSize, origBlockSize, reallocBlockSize); - REQUIRE_MESSAGE(origBlockSize > reallocBlockSize, "Reallocated block size should descrease."); + REQUIRE_MESSAGE(origBlockSize > reallocBlockSize, "Reallocated block size should decrease."); size_t sysMemUsageAfter = getStabilizedMemUsage(); size_t totalMemSizeAfter = defaultMemPool->extMemPool.backend.getTotalMemSize(); diff --git a/third-party/tbb/third-party-programs.txt b/third-party/tbb/third-party-programs.txt index b555450a..c088429c 100644 --- a/third-party/tbb/third-party-programs.txt +++ b/third-party/tbb/third-party-programs.txt @@ -1,58 +1,55 @@ oneAPI Threading Building Blocks (oneTBB) Third Party Programs File -This file contains the list of third party software ("third party programs") -contained in the Intel software and their required notices and/or license terms. -This third party software, even if included with the distribution of the Intel -software, may be governed by separate license terms, including without limitation, -third party license terms, other Intel software license terms, and open source -software license terms. These separate license terms govern your use of the third -party programs as set forth in the "third-party-programs.txt" or other similarlynamed text file. +This file is the "third-party-programs.txt" file specified in the associated Intel end user license +agreement for the Intel software you are licensing. 
The third party programs and their corresponding required notices and/or license terms are listed below. _______________________________________________________________________________________________________ -1. Intel(R) Instrumentation and Tracing Technology (ITT) - Copyright (c) 2022 Intel Corporation. All rights reserved. +1. Instrumentation and Tracing Technology (ITT) Notify User API: + Copyright (c) 2005-2023 Intel Corporation. All rights reserved. - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - 3. Neither the name of the copyright holder nor the names of its contributors may be - used to endorse or promote products derived from this software - without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - SUCH DAMAGE. + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _______________________________________________________________________________________________________ -2. ActiveState Thread pool with same API as (multi) processing.Pool (Python recipe): - Copyright (c) 2008,2016 david decotigny (this file) - Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool) +2. 
Portable Hardware Locality (hwloc): - Portable Hardware Locality (hwloc) - Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana University Research and Technology Corporation. All rights reserved. - Copyright (c) 2004-2005 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. - Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, University of Stuttgart. All rights reserved. + Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana University Research and + Technology Corporation. All rights reserved. + Copyright (c) 2004-2005 The University of Tennessee and The University of Tennessee Research + Foundation. All rights reserved. + Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, University of Stuttgart. + All rights reserved. Copyright (c) 2004-2005 The Regents of the University of California. All rights reserved. Copyright (c) 2009 CNRS Copyright (c) 2009-2016 Inria. All rights reserved. - Copyright (c) 2009-2015 Université Bordeaux + Copyright (c) 2009-2015 Université Bordeaux Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. Copyright (c) 2010 IBM @@ -60,35 +57,32 @@ ________________________________________________________________________________ Copyright (c) 2012 Aleksej Saushev, The NetBSD Foundation Copyright (c) 2012 Blue Brain Project, EPFL. All rights reserved. Copyright (c) 2013-2014 University of Wisconsin-La Crosse. All rights reserved. - Copyright (c) 2015 Research Organization for Information Science and Technology (RIST). All rights reserved. + Copyright (c) 2015 Research Organization for Information Science and Technology (RIST). + All rights reserved. Copyright (c) 2015-2016 Intel, Inc. All rights reserved. - - BSD 3-clause "New" or "Revised" License + See COPYING in top-level directory. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of author nor the names of any contributors may be - used to endorse or promote products derived from this software - without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - SUCH DAMAGE. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _______________________________________________________________________________________________________ 3. gperftools: Copyright (c) 2011, Google Inc. @@ -126,268 +120,60 @@ ________________________________________________________________________________ 4. Mateusz Kwiatkowski Workaround for bug 62258 in libstdc++ - GPL 3.0 with GCC Runtime Library Exception 3.1 - - GNU GENERAL PUBLIC LICENSE - - Version 3, 29 June 2007 - - Copyright (c) 2007 Free Software Foundation, Inc. - - Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. - - Preamble - The GNU General Public License is a free, copyleft license for software and other kinds of works. - - The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. - - When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. - - Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. - - Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. 
If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and modification follow. - - TERMS AND CONDITIONS - 0. Definitions. - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based on the Program. - - To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. - - 1. Source Code. - The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. - - A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. - - The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. - - The Corresponding Source for a work in source code form is that same work. - - 2. Basic Permissions. - All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
- - Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. - - When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. - - 4. Conveying Verbatim Copies. - You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
- b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". - c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. - d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. - A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. - - 6. Conveying Non-Source Forms. - You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: - - a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
- b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. - c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. - d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. - e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
- A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. - - If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). - - The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. - - 7. Additional Terms. - "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or - b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or - c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or - d) Limiting the use for publicity purposes of names of licensors or authors of the material; or - e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or - f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. - All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. - - 8. Termination. - You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). - - However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. - - Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. - - 9. Acceptance Not Required for Having Copies. - You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. - - 11. Patents. - A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. - - If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. - - A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. 
- If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. - - 14. Revised Versions of this License. - The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. 
If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. - - Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. - - 15. Disclaimer of Warranty. - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. 
- If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. 
- The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . - - The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . - - - GCC RUNTIME LIBRARY EXCEPTION - - Version 3.1, 31 March 2009 - - Copyright (c) 2009 Free Software Foundation, Inc. - - Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. - - This GCC Runtime Library Exception ("Exception") is an additional permission under section 7 of the GNU General Public License, version 3 ("GPLv3"). It applies to a given file (the "Runtime Library") that bears a notice placed by the copyright holder of the file stating that the file is governed by GPLv3 along with this Exception. - - When you use GCC to compile a program, GCC may combine portions of certain GCC header files and runtime libraries with the compiled program. The purpose of this Exception is to allow compilation of non-GPL (including proprietary) programs to use, in this way, the header files and runtime libraries covered by this Exception. - - 0. Definitions. - A file is an "Independent Module" if it either requires the Runtime Library for execution after a Compilation Process, or makes use of an interface provided by the Runtime Library, but is not otherwise based on the Runtime Library. 
- - "GCC" means a version of the GNU Compiler Collection, with or without modifications, governed by version 3 (or a specified later version) of the GNU General Public License (GPL) with the option of using any subsequent versions published by the FSF. - - "GPL-compatible Software" is software whose conditions of propagation, modification and use would permit combination with GCC in accord with the license of GCC. - - "Target Code" refers to output from any compiler for a real or virtual target processor architecture, in executable form or suitable for input to an assembler, loader, linker and/or execution phase. Notwithstanding that, Target Code does not include data in any format that is used as a compiler intermediate representation, or used for producing a compiler intermediate representation. - - The "Compilation Process" transforms code entirely represented in non-intermediate languages designed for human-written code, and/or in Java Virtual Machine byte code, into Target Code. Thus, for example, use of source code generators and preprocessors need not be considered part of the Compilation Process, since the Compilation Process can be understood as starting with the output of the generators or preprocessors. - - A Compilation Process is "Eligible" if it is done using GCC, alone or with other GPL-compatible software, or if it is done without using any work based on GCC. For example, using non-GPL-compatible Software to optimize any GCC intermediate representations would not qualify as an Eligible Compilation Process. - - 1. Grant of Additional Permission. - You have permission to propagate a work of Target Code formed by combining the Runtime Library with Independent Modules, even if such propagation would otherwise violate the terms of GPLv3, provided that all Target Code was generated by Eligible Compilation Processes. You may then convey such a combination under terms of your choice, consistent with the licensing of the Independent Modules. 
+ ******************************************************************************** + * Author: Mateusz Kwiatkowski * + * * + * I hereby renounce all copyright to this file and my rights resulting from * + * it, to the broadest extent permitted by law. It may be treated as public * + * domain. * + * * + * However, as this file interfaces with GCC internal ABI, it may be subject to * + * the terms and conditions of the GNU General Public License. Please consult * + * the GCC licensing terms and/or a lawyer for details. * + * * + * Note that libstdc++ licensing terms grant additional permissions described * + * in the GCC Runtime Library Exception, version 3.1, as published by the * + * Free Software Foundation. * + *******************************************************************************/ +_______________________________________________________________________________________________________ - 2. No Weakening of GCC Copyleft. - The availability of this Exception does not imply any general presumption that third-party software is unaffected by the copyleft requirements of the license of GCC. +5. ActiveState Thread pool with same API as (multi) processing. Pool (Python recipe) + + # + # Copyright (c) 2008,2016 david decotigny (this file) + # Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool) + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions + # are met: + # + # 1. Redistributions of source code must retain the above copyright + # notice, this list of conditions and the following disclaimer. + # 2. Redistributions in binary form must reproduce the above copyright + # notice, this list of conditions and the following disclaimer in the + # documentation and/or other materials provided with the distribution. + # 3. 
Neither the name of author nor the names of any contributors may be + # used to endorse or promote products derived from this software + # without specific prior written permission. + # + # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND + # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + # SUCH DAMAGE. _______________________________________________________________________________________________________ -5. Doctest - - Copyright (c) 2016-2021 Viktor Kirilov +6. doctest - The MIT License (MIT) + Copyright (c) 2016-2023 Viktor Kirilov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -406,6 +192,7 @@ ________________________________________________________________________________ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + _______________________________________________________________________________________________________ -*Other names and brands may be claimed as the property of others. +*Other names and brands may be claimed as the property of others. 
\ No newline at end of file diff --git a/third-party/zlib/.github/workflows/cmake.yml b/third-party/zlib/.github/workflows/cmake.yml deleted file mode 100644 index d15fda86..00000000 --- a/third-party/zlib/.github/workflows/cmake.yml +++ /dev/null @@ -1,89 +0,0 @@ -name: CMake -on: [push, pull_request] -jobs: - ci-cmake: - name: ${{ matrix.name }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - name: Ubuntu GCC - os: ubuntu-latest - compiler: gcc - - # Test out of source builds - - name: Ubuntu GCC OSB - os: ubuntu-latest - compiler: gcc - build-dir: ../build - src-dir: ../zlib - - - name: Ubuntu GCC -O3 - os: ubuntu-latest - compiler: gcc - cflags: -O3 - - - name: Ubuntu Clang - os: ubuntu-latest - compiler: clang - - - name: Ubuntu Clang Debug - os: ubuntu-latest - compiler: clang - build-config: Debug - - - name: Windows MSVC Win32 - os: windows-latest - compiler: cl - cmake-args: -A Win32 - - - name: Windows MSVC Win64 - os: windows-latest - compiler: cl - cmake-args: -A x64 - - - name: Windows GCC - os: windows-latest - compiler: gcc - cmake-args: -G Ninja - - - name: macOS Clang - os: macos-latest - compiler: clang - - - name: macOS GCC - os: macos-latest - compiler: gcc-11 - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install packages (Windows) - if: runner.os == 'Windows' - run: | - choco install --no-progress ninja ${{ matrix.packages }} - - - name: Generate project files - run: cmake -S ${{ matrix.src-dir || '.' }} -B ${{ matrix.build-dir || '.' }} ${{ matrix.cmake-args }} -D CMAKE_BUILD_TYPE=${{ matrix.build-config || 'Release' }} - env: - CC: ${{ matrix.compiler }} - CFLAGS: ${{ matrix.cflags }} - - - name: Compile source code - run: cmake --build ${{ matrix.build-dir || '.' }} --config ${{ matrix.build-config || 'Release' }} - - - name: Run test cases - run: ctest -C Release --output-on-failure --max-width 120 - working-directory: ${{ matrix.build-dir || '.' 
}} - - - name: Upload build errors - uses: actions/upload-artifact@v3 - if: failure() - with: - name: ${{ matrix.name }} (cmake) - path: | - **/CMakeFiles/CMakeOutput.log - **/CMakeFiles/CMakeError.log - retention-days: 7 diff --git a/third-party/zlib/.github/workflows/configure.yml b/third-party/zlib/.github/workflows/configure.yml deleted file mode 100644 index e7839211..00000000 --- a/third-party/zlib/.github/workflows/configure.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Configure -on: [push, pull_request] -jobs: - ci-configure: - name: ${{ matrix.name }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - name: Ubuntu GCC - os: ubuntu-latest - compiler: gcc - configure-args: --warn - - # Test out of source builds - - name: Ubuntu GCC OSB - os: ubuntu-latest - compiler: gcc - configure-args: --warn - build-dir: ../build - src-dir: ../zlib - - - name: Ubuntu GCC ARM SF - os: ubuntu-latest - compiler: arm-linux-gnueabi-gcc - configure-args: --warn - chost: arm-linux-gnueabi - packages: qemu qemu-user gcc-arm-linux-gnueabi libc-dev-armel-cross - qemu-run: qemu-arm -L /usr/arm-linux-gnueabi - - - name: Ubuntu GCC ARM HF - os: ubuntu-latest - compiler: arm-linux-gnueabihf-gcc - configure-args: --warn - chost: arm-linux-gnueabihf - packages: qemu qemu-user gcc-arm-linux-gnueabihf libc-dev-armhf-cross - qemu-run: qemu-arm -L /usr/arm-linux-gnueabihf - - - name: Ubuntu GCC AARCH64 - os: ubuntu-latest - compiler: aarch64-linux-gnu-gcc - configure-args: --warn - chost: aarch64-linux-gnu - packages: qemu qemu-user gcc-aarch64-linux-gnu libc-dev-arm64-cross - qemu-run: qemu-aarch64 -L /usr/aarch64-linux-gnu - - - name: Ubuntu GCC PPC - os: ubuntu-latest - compiler: powerpc-linux-gnu-gcc - configure-args: --warn --static - chost: powerpc-linux-gnu - packages: qemu qemu-user gcc-powerpc-linux-gnu libc-dev-powerpc-cross - qemu-run: qemu-ppc -L /usr/powerpc-linux-gnu - cflags: -static - ldflags: -static - - - name: Ubuntu GCC PPC64 - os: 
ubuntu-latest - compiler: powerpc64-linux-gnu-gcc - configure-args: --warn --static - chost: powerpc-linux-gnu - packages: qemu qemu-user gcc-powerpc64-linux-gnu libc-dev-ppc64-cross - qemu-run: qemu-ppc64 -L /usr/powerpc64-linux-gnu - cflags: -static - ldflags: -static - - - name: Ubuntu GCC PPC64LE - os: ubuntu-latest - compiler: powerpc64le-linux-gnu-gcc - configure-args: --warn - chost: powerpc64le-linux-gnu - packages: qemu qemu-user gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross - qemu-run: qemu-ppc64le -L /usr/powerpc64le-linux-gnu - - - name: Ubuntu GCC S390X - os: ubuntu-latest - compiler: s390x-linux-gnu-gcc - configure-args: --warn --static - chost: s390x-linux-gnu - packages: qemu qemu-user gcc-s390x-linux-gnu libc-dev-s390x-cross - qemu-run: qemu-s390x -L /usr/s390x-linux-gnu - cflags: -static - ldflags: -static - - - name: macOS GCC - os: macos-latest - compiler: gcc-11 - configure-args: --warn - - - name: macOS Clang - os: macos-latest - compiler: clang - configure-args: --warn - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install packages (Ubuntu) - if: runner.os == 'Linux' && matrix.packages - run: | - sudo apt-get update - sudo apt-get install -y ${{ matrix.packages }} - - - name: Generate project files - run: | - [ -d ${{ matrix.build-dir || '.' }} ] || mkdir ${{ matrix.build-dir || '.' }} - cd ${{ matrix.build-dir || '.' }} - ${{ matrix.src-dir || '.' 
}}/configure ${{ matrix.configure-args }} - env: - CC: ${{ matrix.compiler }} - CFLAGS: ${{ matrix.cflags }} - LDFLAGS: ${{ matrix.ldflags }} - CHOST: ${{ matrix.chost }} - - - name: Compile source code - run: make -j2 - working-directory: ${{ matrix.build-dir }} - - - name: Run test cases - run: | - make test - make cover - working-directory: ${{ matrix.build-dir }} - env: - QEMU_RUN: ${{ matrix.qemu-run }} - - - name: Upload build errors - uses: actions/upload-artifact@v3 - if: failure() - with: - name: ${{ matrix.name }} (configure) - path: | - ${{ matrix.build-dir || '.' }}/configure.log - retention-days: 7 diff --git a/third-party/zlib/.github/workflows/fuzz.yml b/third-party/zlib/.github/workflows/fuzz.yml deleted file mode 100644 index 48cd2b9f..00000000 --- a/third-party/zlib/.github/workflows/fuzz.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: OSS-Fuzz -on: [pull_request] -jobs: - Fuzzing: - runs-on: ubuntu-latest - steps: - - name: Build Fuzzers - uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master - with: - oss-fuzz-project-name: 'zlib' - dry-run: false - - - name: Run Fuzzers - uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master - with: - oss-fuzz-project-name: 'zlib' - fuzz-seconds: 300 - dry-run: false - - - name: Upload Crash - uses: actions/upload-artifact@v3 - if: failure() - with: - name: artifacts - path: ./out/artifacts diff --git a/third-party/zlib/.gitignore b/third-party/zlib/.gitignore deleted file mode 100644 index b1c7422f..00000000 --- a/third-party/zlib/.gitignore +++ /dev/null @@ -1,26 +0,0 @@ -*.diff -*.patch -*.orig -*.rej - -*~ -*.a -*.lo -*.o -*.dylib - -*.gcda -*.gcno -*.gcov - -/example -/example64 -/examplesh -/libz.so* -/minigzip -/minigzip64 -/minigzipsh -/zlib.pc -/configure.log - -.DS_Store diff --git a/third-party/zlib/CMakeLists.txt b/third-party/zlib/CMakeLists.txt index eb07a6f3..15ceebe7 100644 --- a/third-party/zlib/CMakeLists.txt +++ b/third-party/zlib/CMakeLists.txt @@ -3,7 +3,9 @@ 
set(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS ON) project(zlib C) -set(VERSION "1.3") +set(VERSION "1.3.1") + +option(ZLIB_BUILD_EXAMPLES "Enable Zlib Examples" ON) set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables") set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries") @@ -148,7 +150,9 @@ if(MINGW) endif(MINGW) add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) +target_include_directories(zlib PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) +target_include_directories(zlibstatic PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) set_target_properties(zlib PROPERTIES SOVERSION 1) @@ -166,7 +170,7 @@ endif() if(UNIX) # On unix-like platforms the library is almost always called libz set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z) - if(NOT APPLE) + if(NOT APPLE AND NOT(CMAKE_SYSTEM_NAME STREQUAL AIX)) set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") endif() elseif(BUILD_SHARED_LIBS AND WIN32) @@ -189,3 +193,26 @@ endif() if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL ) install(FILES ${ZLIB_PC} DESTINATION "${INSTALL_PKGCONFIG_DIR}") endif() + +#============================================================================ +# Example binaries +#============================================================================ +if(ZLIB_BUILD_EXAMPLES) + add_executable(example test/example.c) + target_link_libraries(example zlib) + add_test(example example) + + add_executable(minigzip test/minigzip.c) + target_link_libraries(minigzip zlib) + + if(HAVE_OFF64_T) + add_executable(example64 test/example.c) + target_link_libraries(example64 zlib) + set_target_properties(example64 PROPERTIES 
COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") + add_test(example64 example64) + + add_executable(minigzip64 test/minigzip.c) + target_link_libraries(minigzip64 zlib) + set_target_properties(minigzip64 PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") + endif() +endif() diff --git a/third-party/zlib/ChangeLog b/third-party/zlib/ChangeLog index 8707988a..b801a103 100644 --- a/third-party/zlib/ChangeLog +++ b/third-party/zlib/ChangeLog @@ -1,6 +1,16 @@ ChangeLog file for zlib +Changes in 1.3.1 (22 Jan 2024) +- Reject overflows of zip header fields in minizip +- Fix bug in inflateSync() for data held in bit buffer +- Add LIT_MEM define to use more memory for a small deflate speedup +- Fix decision on the emission of Zip64 end records in minizip +- Add bounds checking to ERR_MSG() macro, used by zError() +- Neutralize zip file traversal attacks in miniunz +- Fix a bug in ZLIB_DEBUG compiles in check_match() +- Various portability and appearance improvements + Changes in 1.3 (18 Aug 2023) - Remove K&R function definitions and zlib2ansi - Fix bug in deflateBound() for level 0 and memLevel 9 diff --git a/third-party/zlib/FAQ b/third-party/zlib/FAQ index 55f1cdc2..92f5d3e2 100644 --- a/third-party/zlib/FAQ +++ b/third-party/zlib/FAQ @@ -14,8 +14,7 @@ The latest zlib FAQ is at http://zlib.net/zlib_faq.html 2. Where can I get a Windows DLL version? The zlib sources can be compiled without change to produce a DLL. See the - file win32/DLL_FAQ.txt in the zlib distribution. Pointers to the - precompiled DLL are found in the zlib web site at http://zlib.net/ . + file win32/DLL_FAQ.txt in the zlib distribution. 3. Where can I get a Visual Basic interface to zlib? 
diff --git a/third-party/zlib/Makefile.in b/third-party/zlib/Makefile.in index 34d3cd72..cb8b00a9 100644 --- a/third-party/zlib/Makefile.in +++ b/third-party/zlib/Makefile.in @@ -1,5 +1,5 @@ # Makefile for zlib -# Copyright (C) 1995-2017 Jean-loup Gailly, Mark Adler +# Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler # For conditions of distribution and use, see copyright notice in zlib.h # To compile and test, type: @@ -22,13 +22,13 @@ CFLAGS=-O SFLAGS=-O LDFLAGS= -TEST_LDFLAGS=$(LDFLAGS) -L. libz.a +TEST_LIBS=-L. libz.a LDSHARED=$(CC) CPP=$(CC) -E STATICLIB=libz.a SHAREDLIB=libz.so -SHAREDLIBV=libz.so.1.3 +SHAREDLIBV=libz.so.1.3.1 SHAREDLIBM=libz.so.1 LIBS=$(STATICLIB) $(SHAREDLIBV) @@ -282,10 +282,10 @@ placebo $(SHAREDLIBV): $(PIC_OBJS) libz.a -@rmdir objs example$(EXE): example.o $(STATICLIB) - $(CC) $(CFLAGS) -o $@ example.o $(TEST_LDFLAGS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ example.o $(TEST_LIBS) minigzip$(EXE): minigzip.o $(STATICLIB) - $(CC) $(CFLAGS) -o $@ minigzip.o $(TEST_LDFLAGS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip.o $(TEST_LIBS) examplesh$(EXE): example.o $(SHAREDLIBV) $(CC) $(CFLAGS) -o $@ example.o $(LDFLAGS) -L. $(SHAREDLIBV) @@ -294,10 +294,10 @@ minigzipsh$(EXE): minigzip.o $(SHAREDLIBV) $(CC) $(CFLAGS) -o $@ minigzip.o $(LDFLAGS) -L. $(SHAREDLIBV) example64$(EXE): example64.o $(STATICLIB) - $(CC) $(CFLAGS) -o $@ example64.o $(TEST_LDFLAGS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ example64.o $(TEST_LIBS) minigzip64$(EXE): minigzip64.o $(STATICLIB) - $(CC) $(CFLAGS) -o $@ minigzip64.o $(TEST_LDFLAGS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ minigzip64.o $(TEST_LIBS) install-libs: $(LIBS) -@if [ ! -d $(DESTDIR)$(exec_prefix) ]; then mkdir -p $(DESTDIR)$(exec_prefix); fi @@ -360,7 +360,7 @@ zconf: $(SRCDIR)zconf.h.in cp -p $(SRCDIR)zconf.h.in zconf.h minizip-test: static - cd contrib/minizip && { CFLAGS="$(CFLAGS)" $(MAKE) test ; cd ../.. ; } + cd contrib/minizip && { CC="$(CC)" CFLAGS="$(CFLAGS)" $(MAKE) test ; cd ../.. 
; } minizip-clean: cd contrib/minizip && { $(MAKE) clean ; cd ../.. ; } diff --git a/third-party/zlib/README b/third-party/zlib/README index e02fc5aa..c5f91754 100644 --- a/third-party/zlib/README +++ b/third-party/zlib/README @@ -1,6 +1,6 @@ ZLIB DATA COMPRESSION LIBRARY -zlib 1.3 is a general purpose data compression library. All the code is +zlib 1.3.1 is a general purpose data compression library. All the code is thread safe. The data format used by the zlib library is described by RFCs (Request for Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950 (zlib format), rfc1951 (deflate format) and @@ -31,7 +31,7 @@ Mark Nelson wrote an article about zlib for the Jan. 1997 issue of Dr. Dobb's Journal; a copy of the article is available at https://marknelson.us/posts/1997/01/01/zlib-engine.html . -The changes made in version 1.3 are documented in the file ChangeLog. +The changes made in version 1.3.1 are documented in the file ChangeLog. Unsupported third party contributions are provided in directory contrib/ . @@ -83,7 +83,7 @@ Acknowledgments: Copyright notice: - (C) 1995-2023 Jean-loup Gailly and Mark Adler + (C) 1995-2024 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/third-party/zlib/configure b/third-party/zlib/configure index cc867c94..c55098af 100755 --- a/third-party/zlib/configure +++ b/third-party/zlib/configure @@ -25,7 +25,7 @@ if test $SRCDIR = "."; then ZINCOUT="-I." SRCDIR="" else - ZINC='-include zconf.h' + ZINC='-I. -include zconf.h' ZINCOUT='-I. 
-I$(SRCDIR)' SRCDIR="$SRCDIR/" fi @@ -44,7 +44,8 @@ STATICLIB=libz.a # extract zlib version numbers from zlib.h VER=`sed -n -e '/VERSION "/s/.*"\(.*\)".*/\1/p' < ${SRCDIR}zlib.h` -VER1=`sed -n -e '/VERSION "/s/.*"\([0-9]*\)\\..*/\1/p' < ${SRCDIR}zlib.h` +VER3=`echo ${VER}|sed -n -e 's/\([0-9]\{1,\}\(\\.[0-9]\{1,\}\)\{1,2\}\).*/\1/p'` +VER1=`echo ${VER}|sed -n -e 's/\([0-9]\{1,\}\)\\..*/\1/p'` # establish commands for library building if "${CROSS_PREFIX}ar" --version >/dev/null 2>/dev/null || test $? -lt 126; then @@ -263,7 +264,7 @@ if test "$gcc" -eq 1 && ($cc -c $test.c) >> configure.log 2>&1; then SHAREDLIB=libz$shared_ext SHAREDLIBV=libz.$VER$shared_ext SHAREDLIBM=libz.$VER1$shared_ext - LDSHARED=${LDSHARED-"$cc -dynamiclib -install_name $libdir/$SHAREDLIBM -compatibility_version $VER1 -current_version $VER"} + LDSHARED=${LDSHARED-"$cc -dynamiclib -install_name $libdir/$SHAREDLIBM -compatibility_version $VER1 -current_version $VER3"} if "${CROSS_PREFIX}libtool" -V 2>&1 | grep Apple > /dev/null; then AR="${CROSS_PREFIX}libtool" elif libtool -V 2>&1 | grep Apple > /dev/null; then @@ -441,7 +442,7 @@ EOF if test $shared -eq 1; then echo Checking for shared library support... | tee -a configure.log # we must test in two steps (cc then ld), required at least on SunOS 4.x - if try $CC -w -c $SFLAGS $test.c && + if try $CC -c $SFLAGS $test.c && try $LDSHARED $SFLAGS -o $test$shared_ext $test.o; then echo Building shared library $SHAREDLIBV with $CC. 
| tee -a configure.log elif test -z "$old_cc" -a -z "$old_cflags"; then diff --git a/third-party/zlib/contrib/delphi/ZLib.pas b/third-party/zlib/contrib/delphi/ZLib.pas index 814ffa67..93fa4c9e 100644 --- a/third-party/zlib/contrib/delphi/ZLib.pas +++ b/third-party/zlib/contrib/delphi/ZLib.pas @@ -152,7 +152,7 @@ procedure DecompressToUserBuf(const InBuf: Pointer; InBytes: Integer; const OutBuf: Pointer; BufSize: Integer); const - zlib_version = '1.3.0'; + zlib_version = '1.3.1'; type EZlibError = class(Exception); diff --git a/third-party/zlib/contrib/infback9/inftree9.c b/third-party/zlib/contrib/infback9/inftree9.c index dc38f24d..ac707ed3 100644 --- a/third-party/zlib/contrib/infback9/inftree9.c +++ b/third-party/zlib/contrib/infback9/inftree9.c @@ -1,5 +1,5 @@ /* inftree9.c -- generate Huffman trees for efficient decoding - * Copyright (C) 1995-2023 Mark Adler + * Copyright (C) 1995-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -9,7 +9,7 @@ #define MAXBITS 15 const char inflate9_copyright[] = - " inflate9 1.3 Copyright 1995-2023 Mark Adler "; + " inflate9 1.3.1 Copyright 1995-2024 Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. 
If for some reason you cannot @@ -59,7 +59,7 @@ int inflate_table9(codetype type, unsigned short FAR *lens, unsigned codes, static const unsigned short lext[31] = { /* Length codes 257..285 extra */ 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 130, 130, 130, 130, 131, 131, 131, 131, 132, 132, 132, 132, - 133, 133, 133, 133, 144, 198, 203}; + 133, 133, 133, 133, 144, 203, 77}; static const unsigned short dbase[32] = { /* Distance codes 0..31 base */ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, diff --git a/third-party/zlib/contrib/infback9/inftree9.h b/third-party/zlib/contrib/infback9/inftree9.h index 2c1252f5..ab2ea28b 100644 --- a/third-party/zlib/contrib/infback9/inftree9.h +++ b/third-party/zlib/contrib/infback9/inftree9.h @@ -41,8 +41,8 @@ typedef struct { examples/enough.c found in the zlib distribution. The arguments to that program are the number of symbols, the initial root table size, and the maximum bit length of a code. "enough 286 9 15" for literal/length codes - returns returns 852, and "enough 32 6 15" for distance codes returns 594. - The initial root table size (9 or 6) is found in the fifth argument of the + returns 852, and "enough 32 6 15" for distance codes returns 594. The + initial root table size (9 or 6) is found in the fifth argument of the inflate_table() calls in infback9.c. If the root table size is changed, then these maximum sizes would be need to be recalculated and updated. 
*/ #define ENOUGH_LENS 852 diff --git a/third-party/zlib/contrib/iostream3/zfstream.h b/third-party/zlib/contrib/iostream3/zfstream.h index 8574479a..3dabc0f9 100644 --- a/third-party/zlib/contrib/iostream3/zfstream.h +++ b/third-party/zlib/contrib/iostream3/zfstream.h @@ -413,7 +413,7 @@ template class gzomanip2 { public: - // Allows insertor to peek at internals + // Allows inserter to peek at internals template friend gzofstream& operator<<(gzofstream&, @@ -452,7 +452,7 @@ template : func(f), val1(v1), val2(v2) { } -// Insertor applies underlying manipulator function to stream +// Inserter applies underlying manipulator function to stream template inline gzofstream& operator<<(gzofstream& s, const gzomanip2& m) diff --git a/third-party/zlib/contrib/minizip/Makefile b/third-party/zlib/contrib/minizip/Makefile index aac76e07..3d927ec1 100644 --- a/third-party/zlib/contrib/minizip/Makefile +++ b/third-party/zlib/contrib/minizip/Makefile @@ -1,4 +1,4 @@ -CC=cc +CC?=cc CFLAGS := $(CFLAGS) -O -I../.. UNZ_OBJS = miniunz.o unzip.o ioapi.o ../../libz.a diff --git a/third-party/zlib/contrib/minizip/configure.ac b/third-party/zlib/contrib/minizip/configure.ac index df80e5b7..15ec9171 100644 --- a/third-party/zlib/contrib/minizip/configure.ac +++ b/third-party/zlib/contrib/minizip/configure.ac @@ -1,7 +1,7 @@ # -*- Autoconf -*- # Process this file with autoconf to produce a configure script. 
-AC_INIT([minizip], [1.3.0], [bugzilla.redhat.com]) +AC_INIT([minizip], [1.3.1], [bugzilla.redhat.com]) AC_CONFIG_SRCDIR([minizip.c]) AM_INIT_AUTOMAKE([foreign]) LT_INIT diff --git a/third-party/zlib/contrib/minizip/ioapi.h b/third-party/zlib/contrib/minizip/ioapi.h index c588a18d..a2d2e6e6 100644 --- a/third-party/zlib/contrib/minizip/ioapi.h +++ b/third-party/zlib/contrib/minizip/ioapi.h @@ -144,7 +144,7 @@ typedef long (ZCALLBACK *tell_file_func) (voidpf opaque, voidpf stream) typedef long (ZCALLBACK *seek_file_func) (voidpf opaque, voidpf stream, uLong offset, int origin); -/* here is the "old" 32 bits structure structure */ +/* here is the "old" 32 bits structure */ typedef struct zlib_filefunc_def_s { open_file_func zopen_file; diff --git a/third-party/zlib/contrib/minizip/miniunz.c b/third-party/zlib/contrib/minizip/miniunz.c index a12aec8b..d627c422 100644 --- a/third-party/zlib/contrib/minizip/miniunz.c +++ b/third-party/zlib/contrib/minizip/miniunz.c @@ -79,7 +79,7 @@ /* change_file_date : change the date/time of a file filename : the filename of the file where date/time must be modified - dosdate : the new date at the MSDos format (4 bytes) + dosdate : the new date at the MSDOS format (4 bytes) tmu_date : the SAME new date at the tm_unz format */ static void change_file_date(const char *filename, uLong dosdate, tm_unz tmu_date) { #ifdef _WIN32 @@ -186,7 +186,7 @@ static int makedir(const char *newdir) { } static void do_banner(void) { - printf("MiniUnz 1.01b, demo of zLib + Unz package written by Gilles Vollant\n"); + printf("MiniUnz 1.1, demo of zLib + Unz package written by Gilles Vollant\n"); printf("more info at http://www.winimage.com/zLibDll/unzip.html\n\n"); } @@ -356,6 +356,20 @@ static int do_extract_currentfile(unzFile uf, const int* popt_extract_without_pa else write_filename = filename_withoutpath; + if (write_filename[0]!='\0') + { + const char* relative_check = write_filename; + while (relative_check[1]!='\0') + { + if 
(relative_check[0]=='.' && relative_check[1]=='.') + write_filename = relative_check; + relative_check++; + } + } + + while (write_filename[0]=='/' || write_filename[0]=='.') + write_filename++; + err = unzOpenCurrentFilePassword(uf,password); if (err!=UNZ_OK) { diff --git a/third-party/zlib/contrib/minizip/unzip.c b/third-party/zlib/contrib/minizip/unzip.c index ed763f89..ea05b7d6 100644 --- a/third-party/zlib/contrib/minizip/unzip.c +++ b/third-party/zlib/contrib/minizip/unzip.c @@ -117,7 +117,7 @@ const char unz_copyright[] = " unzip 1.01 Copyright 1998-2004 Gilles Vollant - http://www.winimage.com/zLibDll"; -/* unz_file_info_interntal contain internal info about a file in zipfile*/ +/* unz_file_info64_internal contain internal info about a file in zipfile*/ typedef struct unz_file_info64_internal_s { ZPOS64_T offset_curfile;/* relative offset of local header 8 bytes */ @@ -450,7 +450,7 @@ local ZPOS64_T unz64local_SearchCentralDir64(const zlib_filefunc64_32_def* pzlib if (unz64local_getLong(pzlib_filefunc_def,filestream,&uL)!=UNZ_OK) return CENTRALDIRINVALID; - /* number of the disk with the start of the zip64 end of central directory */ + /* number of the disk with the start of the zip64 end of central directory */ if (unz64local_getLong(pzlib_filefunc_def,filestream,&uL)!=UNZ_OK) return CENTRALDIRINVALID; if (uL != 0) @@ -497,9 +497,9 @@ local unzFile unzOpenInternal(const void *path, ZPOS64_T central_pos; uLong uL; - uLong number_disk; /* number of the current dist, used for + uLong number_disk; /* number of the current disk, used for spanning ZIP, unsupported, always 0*/ - uLong number_disk_with_CD; /* number the the disk with central dir, used + uLong number_disk_with_CD; /* number the disk with central dir, used for spanning ZIP, unsupported, always 0*/ ZPOS64_T number_entry_CD; /* total number of entries in the central dir diff --git a/third-party/zlib/contrib/minizip/unzip.h b/third-party/zlib/contrib/minizip/unzip.h index 14105840..5cfc9c62 100644 --- 
a/third-party/zlib/contrib/minizip/unzip.h +++ b/third-party/zlib/contrib/minizip/unzip.h @@ -306,7 +306,7 @@ extern int ZEXPORT unzGetCurrentFileInfo(unzFile file, Get Info about the current file if pfile_info!=NULL, the *pfile_info structure will contain some info about the current file - if szFileName!=NULL, the filemane string will be copied in szFileName + if szFileName!=NULL, the filename string will be copied in szFileName (fileNameBufferSize is the size of the buffer) if extraField!=NULL, the extra field information will be copied in extraField (extraFieldBufferSize is the size of the buffer). diff --git a/third-party/zlib/contrib/minizip/zip.c b/third-party/zlib/contrib/minizip/zip.c index 3d3d4cad..60bdffac 100644 --- a/third-party/zlib/contrib/minizip/zip.c +++ b/third-party/zlib/contrib/minizip/zip.c @@ -575,7 +575,7 @@ local ZPOS64_T zip64local_SearchCentralDir64(const zlib_filefunc64_32_def* pzlib if (zip64local_getLong(pzlib_filefunc_def,filestream,&uL)!=ZIP_OK) return 0; - /* number of the disk with the start of the zip64 end of central directory */ + /* number of the disk with the start of the zip64 end of central directory */ if (zip64local_getLong(pzlib_filefunc_def,filestream,&uL)!=ZIP_OK) return 0; if (uL != 0) @@ -614,9 +614,9 @@ local int LoadCentralDirectoryRecord(zip64_internal* pziinit) { ZPOS64_T central_pos; uLong uL; - uLong number_disk; /* number of the current dist, used for + uLong number_disk; /* number of the current disk, used for spanning ZIP, unsupported, always 0*/ - uLong number_disk_with_CD; /* number the the disk with central dir, used + uLong number_disk_with_CD; /* number of the disk with central dir, used for spanning ZIP, unsupported, always 0*/ ZPOS64_T number_entry; ZPOS64_T number_entry_CD; /* total number of entries in @@ -1043,6 +1043,17 @@ extern int ZEXPORT zipOpenNewFileInZip4_64(zipFile file, const char* filename, c return ZIP_PARAMERROR; #endif + // The filename and comment length must fit in 16 bits. 
+ if ((filename!=NULL) && (strlen(filename)>0xffff)) + return ZIP_PARAMERROR; + if ((comment!=NULL) && (strlen(comment)>0xffff)) + return ZIP_PARAMERROR; + // The extra field length must fit in 16 bits. If the member also requires + // a Zip64 extra block, that will also need to fit within that 16-bit + // length, but that will be checked for later. + if ((size_extrafield_local>0xffff) || (size_extrafield_global>0xffff)) + return ZIP_PARAMERROR; + zi = (zip64_internal*)file; if (zi->in_opened_file_inzip == 1) @@ -1597,7 +1608,7 @@ extern int ZEXPORT zipCloseFileInZipRaw64(zipFile file, ZPOS64_T uncompressed_si if((uLong)(datasize + 4) > zi->ci.size_centralExtraFree) { - // we can not write more data to the buffer that we have room for. + // we cannot write more data to the buffer that we have room for. return ZIP_BADZIPFILE; } @@ -1861,7 +1872,7 @@ extern int ZEXPORT zipClose(zipFile file, const char* global_comment) { free_linkedlist(&(zi->central_dir)); pos = centraldir_pos_inzip - zi->add_position_when_writing_offset; - if(pos >= 0xffffffff || zi->number_entry > 0xFFFF) + if(pos >= 0xffffffff || zi->number_entry >= 0xFFFF) { ZPOS64_T Zip64EOCDpos = ZTELL64(zi->z_filefunc,zi->filestream); Write_Zip64EndOfCentralDirectoryRecord(zi, size_centraldir, centraldir_pos_inzip); diff --git a/third-party/zlib/contrib/minizip/zip.h b/third-party/zlib/contrib/minizip/zip.h index 5fc08413..3e230d34 100644 --- a/third-party/zlib/contrib/minizip/zip.h +++ b/third-party/zlib/contrib/minizip/zip.h @@ -177,9 +177,9 @@ extern int ZEXPORT zipOpenNewFileInZip64(zipFile file, filename : the filename in zip (if NULL, '-' without quote will be used *zipfi contain supplemental information if extrafield_local!=NULL and size_extrafield_local>0, extrafield_local - contains the extrafield data the the local header + contains the extrafield data for the local header if extrafield_global!=NULL and size_extrafield_global>0, extrafield_global - contains the extrafield data the the local header + 
contains the extrafield data for the global header if comment != NULL, comment contain the comment string method contain the compression method (0 for store, Z_DEFLATED for deflate) level contain the level of compression (can be Z_DEFAULT_COMPRESSION) diff --git a/third-party/zlib/contrib/nuget/nuget.csproj b/third-party/zlib/contrib/nuget/nuget.csproj new file mode 100644 index 00000000..68627f03 --- /dev/null +++ b/third-party/zlib/contrib/nuget/nuget.csproj @@ -0,0 +1,43 @@ + + + + net6.0 + madler.zlib.redist + $(PackageId).win + $(PackageId).linux + $(PackageId).osx + (C) 1995-2024 Jean-loup Gailly and Mark Adler + 1.3.1 + NuGet Package for consuming native builds of zlib into .NET without complexity. + + NU5128 + $(MSBuildProjectDirectory) + Jean-loup Gailly and Mark Adler + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/third-party/zlib/contrib/nuget/nuget.sln b/third-party/zlib/contrib/nuget/nuget.sln new file mode 100644 index 00000000..46ee8dea --- /dev/null +++ b/third-party/zlib/contrib/nuget/nuget.sln @@ -0,0 +1,22 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "nuget", "nuget.csproj", "{B1BD3984-EF8F-4E9D-9A94-EB784E5EB1E8}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B1BD3984-EF8F-4E9D-9A94-EB784E5EB1E8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B1BD3984-EF8F-4E9D-9A94-EB784E5EB1E8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B1BD3984-EF8F-4E9D-9A94-EB784E5EB1E8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B1BD3984-EF8F-4E9D-9A94-EB784E5EB1E8}.Release|Any CPU.Build.0 = 
Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/third-party/zlib/contrib/pascal/zlibpas.pas b/third-party/zlib/contrib/pascal/zlibpas.pas index a2b24a59..0cf0e7b8 100644 --- a/third-party/zlib/contrib/pascal/zlibpas.pas +++ b/third-party/zlib/contrib/pascal/zlibpas.pas @@ -10,7 +10,7 @@ interface const - ZLIB_VERSION = '1.3.0'; + ZLIB_VERSION = '1.3.1'; ZLIB_VERNUM = $12a0; type diff --git a/third-party/zlib/contrib/puff/puff.c b/third-party/zlib/contrib/puff/puff.c index 6737ff61..d759825a 100644 --- a/third-party/zlib/contrib/puff/puff.c +++ b/third-party/zlib/contrib/puff/puff.c @@ -593,10 +593,10 @@ local int fixed(struct state *s) * provided for each of the literal/length symbols, and for each of the * distance symbols. * - * - If a symbol is not used in the block, this is represented by a zero as - * as the code length. This does not mean a zero-length code, but rather - * that no code should be created for this symbol. There is no way in the - * deflate format to represent a zero-length code. + * - If a symbol is not used in the block, this is represented by a zero as the + * code length. This does not mean a zero-length code, but rather that no + * code should be created for this symbol. There is no way in the deflate + * format to represent a zero-length code. * * - The maximum number of bits in a code is 15, so the possible lengths for * any code are 1..15. diff --git a/third-party/zlib/contrib/vstudio/readme.txt b/third-party/zlib/contrib/vstudio/readme.txt index 05ba487c..061bbc0e 100644 --- a/third-party/zlib/contrib/vstudio/readme.txt +++ b/third-party/zlib/contrib/vstudio/readme.txt @@ -1,75 +1,81 @@ -Building instructions for the DLL versions of Zlib 1.3.0 -======================================================== - -This directory contains projects that build zlib and minizip using -Microsoft Visual C++ 9.0/10.0. - -You don't need to build these projects yourself. 
You can download the -binaries from: - http://www.winimage.com/zLibDll - -More information can be found at this site. - - - - - -Build instructions for Visual Studio 2008 (32 bits or 64 bits) --------------------------------------------------------------- -- Decompress current zlib, including all contrib/* files -- Open contrib\vstudio\vc9\zlibvc.sln with Microsoft Visual C++ 2008 -- Or run: vcbuild /rebuild contrib\vstudio\vc9\zlibvc.sln "Release|Win32" - -Build instructions for Visual Studio 2010 (32 bits or 64 bits) --------------------------------------------------------------- -- Decompress current zlib, including all contrib/* files -- Open contrib\vstudio\vc10\zlibvc.sln with Microsoft Visual C++ 2010 - -Build instructions for Visual Studio 2012 (32 bits or 64 bits) --------------------------------------------------------------- -- Decompress current zlib, including all contrib/* files -- Open contrib\vstudio\vc11\zlibvc.sln with Microsoft Visual C++ 2012 - -Build instructions for Visual Studio 2013 (32 bits or 64 bits) --------------------------------------------------------------- -- Decompress current zlib, including all contrib/* files -- Open contrib\vstudio\vc12\zlibvc.sln with Microsoft Visual C++ 2013 - -Build instructions for Visual Studio 2015 (32 bits or 64 bits) --------------------------------------------------------------- -- Decompress current zlib, including all contrib/* files -- Open contrib\vstudio\vc14\zlibvc.sln with Microsoft Visual C++ 2015 - - -Important ---------- -- To use zlibwapi.dll in your application, you must define the - macro ZLIB_WINAPI when compiling your application's source files. - - -Additional notes ----------------- -- This DLL, named zlibwapi.dll, is compatible to the old zlib.dll built - by Gilles Vollant from the zlib 1.1.x sources, and distributed at - http://www.winimage.com/zLibDll - It uses the WINAPI calling convention for the exported functions, and - includes the minizip functionality. 
If your application needs that - particular build of zlib.dll, you can rename zlibwapi.dll to zlib.dll. - -- The new DLL was renamed because there exist several incompatible - versions of zlib.dll on the Internet. - -- There is also an official DLL build of zlib, named zlib1.dll. This one - is exporting the functions using the CDECL convention. See the file - win32\DLL_FAQ.txt found in this zlib distribution. - -- There used to be a ZLIB_DLL macro in zlib 1.1.x, but now this symbol - has a slightly different effect. To avoid compatibility problems, do - not define it here. - - -Gilles Vollant -info@winimage.com - -Visual Studio 2013 and 2015 Projects from Sean Hunt -seandhunt_7@yahoo.com +Building instructions for the DLL versions of Zlib 1.3.1 +======================================================== + +This directory contains projects that build zlib and minizip using +Microsoft Visual C++ 9.0/10.0. + +You don't need to build these projects yourself. You can download the +binaries from: + http://www.winimage.com/zLibDll + +More information can be found at this site. 
+ + + + + +Build instructions for Visual Studio 2008 (32 bits or 64 bits) +-------------------------------------------------------------- +- Decompress current zlib, including all contrib/* files +- Open contrib\vstudio\vc9\zlibvc.sln with Microsoft Visual C++ 2008 +- Or run: vcbuild /rebuild contrib\vstudio\vc9\zlibvc.sln "Release|Win32" + +Build instructions for Visual Studio 2010 (32 bits or 64 bits) +-------------------------------------------------------------- +- Decompress current zlib, including all contrib/* files +- Open contrib\vstudio\vc10\zlibvc.sln with Microsoft Visual C++ 2010 + +Build instructions for Visual Studio 2012 (32 bits or 64 bits) +-------------------------------------------------------------- +- Decompress current zlib, including all contrib/* files +- Open contrib\vstudio\vc11\zlibvc.sln with Microsoft Visual C++ 2012 + +Build instructions for Visual Studio 2013 (32 bits or 64 bits) +-------------------------------------------------------------- +- Decompress current zlib, including all contrib/* files +- Open contrib\vstudio\vc12\zlibvc.sln with Microsoft Visual C++ 2013 + +Build instructions for Visual Studio 2015 (32 bits or 64 bits) +-------------------------------------------------------------- +- Decompress current zlib, including all contrib/* files +- Open contrib\vstudio\vc14\zlibvc.sln with Microsoft Visual C++ 2015 + +Build instructions for Visual Studio 2022 (64 bits) +-------------------------------------------------------------- +- Decompress current zlib, including all contrib/* files +- Open contrib\vstudio\vc143\zlibvc.sln with Microsoft Visual C++ 2022 + + + +Important +--------- +- To use zlibwapi.dll in your application, you must define the + macro ZLIB_WINAPI when compiling your application's source files. 
+ + +Additional notes +---------------- +- This DLL, named zlibwapi.dll, is compatible to the old zlib.dll built + by Gilles Vollant from the zlib 1.1.x sources, and distributed at + http://www.winimage.com/zLibDll + It uses the WINAPI calling convention for the exported functions, and + includes the minizip functionality. If your application needs that + particular build of zlib.dll, you can rename zlibwapi.dll to zlib.dll. + +- The new DLL was renamed because there exist several incompatible + versions of zlib.dll on the Internet. + +- There is also an official DLL build of zlib, named zlib1.dll. This one + is exporting the functions using the CDECL convention. See the file + win32\DLL_FAQ.txt found in this zlib distribution. + +- There used to be a ZLIB_DLL macro in zlib 1.1.x, but now this symbol + has a slightly different effect. To avoid compatibility problems, do + not define it here. + + +Gilles Vollant +info@winimage.com + +Visual Studio 2013, 2015, and 2022 Projects from Sean Hunt +seandhunt_7@yahoo.com diff --git a/third-party/zlib/contrib/vstudio/vc10/zlib.rc b/third-party/zlib/contrib/vstudio/vc10/zlib.rc index 29af8e11..856bd11f 100644 --- a/third-party/zlib/contrib/vstudio/vc10/zlib.rc +++ b/third-party/zlib/contrib/vstudio/vc10/zlib.rc @@ -2,8 +2,8 @@ #define IDR_VERSION1 1 IDR_VERSION1 VERSIONINFO MOVEABLE IMPURE LOADONCALL DISCARDABLE - FILEVERSION 1, 3, 0, 0 - PRODUCTVERSION 1, 3, 0, 0 + FILEVERSION 1, 3, 1, 0 + PRODUCTVERSION 1, 3, 1, 0 FILEFLAGSMASK VS_FFI_FILEFLAGSMASK FILEFLAGS 0 FILEOS VOS_DOS_WINDOWS32 @@ -17,12 +17,12 @@ BEGIN BEGIN VALUE "FileDescription", "zlib data compression and ZIP file I/O library\0" - VALUE "FileVersion", "1.3.0\0" + VALUE "FileVersion", "1.3.1\0" VALUE "InternalName", "zlib\0" VALUE "OriginalFilename", "zlibwapi.dll\0" VALUE "ProductName", "ZLib.DLL\0" VALUE "Comments","DLL support by Alessandro Iacopetti & Gilles Vollant\0" - VALUE "LegalCopyright", "(C) 1995-2023 Jean-loup Gailly & Mark Adler\0" + VALUE 
"LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" END END BLOCK "VarFileInfo" diff --git a/third-party/zlib/contrib/vstudio/vc10/zlibvc.def b/third-party/zlib/contrib/vstudio/vc10/zlibvc.def index f28aa6c7..3234a02d 100644 --- a/third-party/zlib/contrib/vstudio/vc10/zlibvc.def +++ b/third-party/zlib/contrib/vstudio/vc10/zlibvc.def @@ -1,7 +1,7 @@ LIBRARY ; zlib data compression and ZIP file I/O library -VERSION 1.3 +VERSION 1.3.1 EXPORTS adler32 @1 diff --git a/third-party/zlib/contrib/vstudio/vc11/zlib.rc b/third-party/zlib/contrib/vstudio/vc11/zlib.rc index 29af8e11..856bd11f 100644 --- a/third-party/zlib/contrib/vstudio/vc11/zlib.rc +++ b/third-party/zlib/contrib/vstudio/vc11/zlib.rc @@ -2,8 +2,8 @@ #define IDR_VERSION1 1 IDR_VERSION1 VERSIONINFO MOVEABLE IMPURE LOADONCALL DISCARDABLE - FILEVERSION 1, 3, 0, 0 - PRODUCTVERSION 1, 3, 0, 0 + FILEVERSION 1, 3, 1, 0 + PRODUCTVERSION 1, 3, 1, 0 FILEFLAGSMASK VS_FFI_FILEFLAGSMASK FILEFLAGS 0 FILEOS VOS_DOS_WINDOWS32 @@ -17,12 +17,12 @@ BEGIN BEGIN VALUE "FileDescription", "zlib data compression and ZIP file I/O library\0" - VALUE "FileVersion", "1.3.0\0" + VALUE "FileVersion", "1.3.1\0" VALUE "InternalName", "zlib\0" VALUE "OriginalFilename", "zlibwapi.dll\0" VALUE "ProductName", "ZLib.DLL\0" VALUE "Comments","DLL support by Alessandro Iacopetti & Gilles Vollant\0" - VALUE "LegalCopyright", "(C) 1995-2023 Jean-loup Gailly & Mark Adler\0" + VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" END END BLOCK "VarFileInfo" diff --git a/third-party/zlib/contrib/vstudio/vc11/zlibvc.def b/third-party/zlib/contrib/vstudio/vc11/zlibvc.def index f28aa6c7..3234a02d 100644 --- a/third-party/zlib/contrib/vstudio/vc11/zlibvc.def +++ b/third-party/zlib/contrib/vstudio/vc11/zlibvc.def @@ -1,7 +1,7 @@ LIBRARY ; zlib data compression and ZIP file I/O library -VERSION 1.3 +VERSION 1.3.1 EXPORTS adler32 @1 diff --git a/third-party/zlib/contrib/vstudio/vc12/zlib.rc 
b/third-party/zlib/contrib/vstudio/vc12/zlib.rc index 57fb31a3..a55f341c 100644 --- a/third-party/zlib/contrib/vstudio/vc12/zlib.rc +++ b/third-party/zlib/contrib/vstudio/vc12/zlib.rc @@ -2,8 +2,8 @@ #define IDR_VERSION1 1 IDR_VERSION1 VERSIONINFO MOVEABLE IMPURE LOADONCALL DISCARDABLE - FILEVERSION 1, 3, 0, 0 - PRODUCTVERSION 1, 3, 0, 0 + FILEVERSION 1, 3, 1, 0 + PRODUCTVERSION 1, 3, 1, 0 FILEFLAGSMASK VS_FFI_FILEFLAGSMASK FILEFLAGS 0 FILEOS VOS_DOS_WINDOWS32 @@ -17,12 +17,12 @@ BEGIN BEGIN VALUE "FileDescription", "zlib data compression and ZIP file I/O library\0" - VALUE "FileVersion", "1.3.0\0" + VALUE "FileVersion", "1.3.1\0" VALUE "InternalName", "zlib\0" VALUE "OriginalFilename", "zlibwapi.dll\0" VALUE "ProductName", "ZLib.DLL\0" VALUE "Comments","DLL support by Alessandro Iacopetti & Gilles Vollant\0" - VALUE "LegalCopyright", "(C) 1995-2023 Jean-loup Gailly & Mark Adler\0" + VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" END END BLOCK "VarFileInfo" diff --git a/third-party/zlib/contrib/vstudio/vc12/zlibvc.def b/third-party/zlib/contrib/vstudio/vc12/zlibvc.def index f28aa6c7..3234a02d 100644 --- a/third-party/zlib/contrib/vstudio/vc12/zlibvc.def +++ b/third-party/zlib/contrib/vstudio/vc12/zlibvc.def @@ -1,7 +1,7 @@ LIBRARY ; zlib data compression and ZIP file I/O library -VERSION 1.3 +VERSION 1.3.1 EXPORTS adler32 @1 diff --git a/third-party/zlib/contrib/vstudio/vc14/zlib.rc b/third-party/zlib/contrib/vstudio/vc14/zlib.rc index 57fb31a3..a55f341c 100644 --- a/third-party/zlib/contrib/vstudio/vc14/zlib.rc +++ b/third-party/zlib/contrib/vstudio/vc14/zlib.rc @@ -2,8 +2,8 @@ #define IDR_VERSION1 1 IDR_VERSION1 VERSIONINFO MOVEABLE IMPURE LOADONCALL DISCARDABLE - FILEVERSION 1, 3, 0, 0 - PRODUCTVERSION 1, 3, 0, 0 + FILEVERSION 1, 3, 1, 0 + PRODUCTVERSION 1, 3, 1, 0 FILEFLAGSMASK VS_FFI_FILEFLAGSMASK FILEFLAGS 0 FILEOS VOS_DOS_WINDOWS32 @@ -17,12 +17,12 @@ BEGIN BEGIN VALUE "FileDescription", "zlib data compression and ZIP file I/O 
library\0" - VALUE "FileVersion", "1.3.0\0" + VALUE "FileVersion", "1.3.1\0" VALUE "InternalName", "zlib\0" VALUE "OriginalFilename", "zlibwapi.dll\0" VALUE "ProductName", "ZLib.DLL\0" VALUE "Comments","DLL support by Alessandro Iacopetti & Gilles Vollant\0" - VALUE "LegalCopyright", "(C) 1995-2023 Jean-loup Gailly & Mark Adler\0" + VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" END END BLOCK "VarFileInfo" diff --git a/third-party/zlib/contrib/vstudio/vc14/zlibvc.def b/third-party/zlib/contrib/vstudio/vc14/zlibvc.def index f28aa6c7..3234a02d 100644 --- a/third-party/zlib/contrib/vstudio/vc14/zlibvc.def +++ b/third-party/zlib/contrib/vstudio/vc14/zlibvc.def @@ -1,7 +1,7 @@ LIBRARY ; zlib data compression and ZIP file I/O library -VERSION 1.3 +VERSION 1.3.1 EXPORTS adler32 @1 diff --git a/third-party/zlib/contrib/vstudio/vc17/miniunz.vcxproj b/third-party/zlib/contrib/vstudio/vc17/miniunz.vcxproj new file mode 100644 index 00000000..68ef1658 --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/miniunz.vcxproj @@ -0,0 +1,409 @@ + + + + + Debug + ARM + + + Debug + ARM64 + + + Debug + Win32 + + + Debug + x64 + + + Release + ARM + + + Release + ARM64 + + + Release + Win32 + + + Release + x64 + + + + {C52F9E7B-498A-42BE-8DB4-85A15694382A} + Win32Proj + 10.0 + + + + Application + MultiByte + v143 + + + Application + Unicode + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + x86\MiniUnzip$(Configuration)\ + x86\MiniUnzip$(Configuration)\Tmp\ + true + false + x86\MiniUnzip$(Configuration)\ + x86\MiniUnzip$(Configuration)\Tmp\ + false + false + x64\MiniUnzip$(Configuration)\ + x64\MiniUnzip$(Configuration)\Tmp\ + true + true + true + false + false + false + 
x64\MiniUnzip$(Configuration)\ + x64\MiniUnzip$(Configuration)\Tmp\ + false + false + false + false + false + false + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + + + arm64\MiniUnzip$(Configuration)\ + arm64\MiniUnzip$(Configuration)\Tmp\ + + + arm64\MiniUnzip$(Configuration)\ + arm64\MiniUnzip$(Configuration)\Tmp\ + + + arm\MiniUnzip$(Configuration)\ + arm\MiniUnzip$(Configuration)\Tmp\ + + + arm\MiniUnzip$(Configuration)\ + arm\MiniUnzip$(Configuration)\Tmp\ + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x86\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + $(OutDir)miniunz.pdb + Console + false + + + MachineX86 + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + Default + MultiThreaded + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x86\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + $(OutDir)miniunz.pdb + Console + MachineX64 + + + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + 
_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + $(OutDir)miniunz.pdb + Console + + + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + $(OutDir)miniunz.pdb + Console + + + + + X64 + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + Console + true + true + MachineX64 + + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + Console + true + true + + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + 
x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)miniunz.exe + true + Console + true + true + + + + + + + + {8fd826f8-3739-44e6-8cc8-997122e53b8d} + + + + + + \ No newline at end of file diff --git a/third-party/zlib/contrib/vstudio/vc17/minizip.vcxproj b/third-party/zlib/contrib/vstudio/vc17/minizip.vcxproj new file mode 100644 index 00000000..dd3c52e7 --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/minizip.vcxproj @@ -0,0 +1,405 @@ + + + + + Debug + ARM + + + Debug + ARM64 + + + Debug + Win32 + + + Debug + x64 + + + Release + ARM + + + Release + ARM64 + + + Release + Win32 + + + Release + x64 + + + + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B} + Win32Proj + 10.0 + + + + Application + MultiByte + v143 + + + Application + Unicode + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + x86\MiniZip$(Configuration)\ + x86\MiniZip$(Configuration)\Tmp\ + true + false + x86\MiniZip$(Configuration)\ + x86\MiniZip$(Configuration)\Tmp\ + false + x64\$(Configuration)\ + x64\$(Configuration)\ + true + true + true + false + false + false + x64\$(Configuration)\ + x64\$(Configuration)\ + false + false + false + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + + + arm64\MiniZip$(Configuration)\ + arm64\MiniZip$(Configuration)\Tmp\ + + + arm64\MiniZip$(Configuration)\ + arm64\MiniZip$(Configuration)\Tmp\ + + + arm\MiniZip$(Configuration)\ + arm\MiniZip$(Configuration)\Tmp\ + + + arm\MiniZip$(Configuration)\ + arm\MiniZip$(Configuration)\Tmp\ + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + 
WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x86\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)minizip.exe + true + $(OutDir)minizip.pdb + Console + false + + + MachineX86 + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + Default + MultiThreaded + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x86\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)minizip.exe + true + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)minizip.exe + true + $(OutDir)minizip.pdb + Console + MachineX64 + + + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)minizip.exe + true + $(OutDir)minizip.pdb + Console + + + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + 
$(OutDir)minizip.exe + true + $(OutDir)minizip.pdb + Console + + + + + X64 + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)minizip.exe + true + Console + true + true + MachineX64 + + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)minizip.exe + true + Console + true + true + + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)minizip.exe + true + Console + true + true + + + + + + + + {8fd826f8-3739-44e6-8cc8-997122e53b8d} + + + + + + \ No newline at end of file diff --git a/third-party/zlib/contrib/vstudio/vc17/testzlib.vcxproj b/third-party/zlib/contrib/vstudio/vc17/testzlib.vcxproj new file mode 100644 index 00000000..4cc99b3f --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/testzlib.vcxproj @@ -0,0 +1,473 @@ + + + + + Debug + ARM + + + Debug + ARM64 + + + Debug + Win32 + + + Debug + x64 + + + ReleaseWithoutAsm + ARM + + + ReleaseWithoutAsm + ARM64 + + + ReleaseWithoutAsm + Win32 + + + ReleaseWithoutAsm + x64 + + + Release + ARM + + + Release + ARM64 + + + Release + 
Win32 + + + Release + x64 + + + + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B} + testzlib + Win32Proj + 10.0 + + + + Application + MultiByte + true + v143 + + + Application + MultiByte + true + v143 + + + Application + Unicode + v143 + + + Application + true + v143 + + + Application + true + v143 + + + Application + true + v143 + + + Application + true + v143 + + + Application + true + v143 + + + Application + true + v143 + + + Application + v143 + + + Application + v143 + + + Application + v143 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + x86\TestZlib$(Configuration)\ + x86\TestZlib$(Configuration)\Tmp\ + true + false + x86\TestZlib$(Configuration)\ + x86\TestZlib$(Configuration)\Tmp\ + false + false + x86\TestZlib$(Configuration)\ + x86\TestZlib$(Configuration)\Tmp\ + false + false + x64\TestZlib$(Configuration)\ + x64\TestZlib$(Configuration)\Tmp\ + false + false + false + x64\TestZlib$(Configuration)\ + x64\TestZlib$(Configuration)\Tmp\ + false + false + false + x64\TestZlib$(Configuration)\ + x64\TestZlib$(Configuration)\Tmp\ + false + false + false + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + + + arm64\TestZlib$(Configuration)\ + arm64\TestZlib$(Configuration)\Tmp\ + + + arm64\TestZlib$(Configuration)\ + arm64\TestZlib$(Configuration)\Tmp\ + + + arm64\TestZlib$(Configuration)\ + arm64\TestZlib$(Configuration)\Tmp\ + + + arm\TestZlib$(Configuration)\ + arm\TestZlib$(Configuration)\Tmp\ + + + arm\TestZlib$(Configuration)\ + arm\TestZlib$(Configuration)\Tmp\ + + + arm\TestZlib$(Configuration)\ + arm\TestZlib$(Configuration)\Tmp\ + + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + 
WIN32;ZLIB_WINAPI;_DEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + AssemblyAndSourceCode + $(IntDir) + Level3 + ProgramDatabase + + + %(AdditionalDependencies) + $(OutDir)testzlib.exe + true + $(OutDir)testzlib.pdb + Console + false + + + MachineX86 + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + true + Default + MultiThreaded + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + $(OutDir)testzlib.exe + true + Console + true + true + false + + + MachineX86 + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + true + Default + MultiThreaded + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + %(AdditionalDependencies) + $(OutDir)testzlib.exe + true + Console + true + true + false + + + MachineX86 + false + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;_DEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDebugDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;_DEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDebugDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;_DEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDebugDLL + 
false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;NDEBUG;_CONSOLE;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + Default + MultiThreadedDLL + false + $(IntDir) + + + %(AdditionalDependencies) + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/third-party/zlib/contrib/vstudio/vc17/testzlibdll.vcxproj b/third-party/zlib/contrib/vstudio/vc17/testzlibdll.vcxproj new file mode 100644 index 00000000..73bba55d --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/testzlibdll.vcxproj 
@@ -0,0 +1,409 @@ + + + + + Debug + ARM + + + Debug + ARM64 + + + Debug + Win32 + + + Debug + x64 + + + Release + ARM + + + Release + ARM64 + + + Release + Win32 + + + Release + x64 + + + + {C52F9E7B-498A-42BE-8DB4-85A15694366A} + Win32Proj + 10.0 + + + + Application + MultiByte + v143 + + + Application + Unicode + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + Application + MultiByte + v143 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + x86\TestZlibDll$(Configuration)\ + x86\TestZlibDll$(Configuration)\Tmp\ + true + false + x86\TestZlibDll$(Configuration)\ + x86\TestZlibDll$(Configuration)\Tmp\ + false + false + x64\TestZlibDll$(Configuration)\ + x64\TestZlibDll$(Configuration)\Tmp\ + true + true + true + false + false + false + x64\TestZlibDll$(Configuration)\ + x64\TestZlibDll$(Configuration)\Tmp\ + false + false + false + false + false + false + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + + + arm64\TestZlibDll$(Configuration)\ + arm64\TestZlibDll$(Configuration)\Tmp\ + + + arm64\TestZlibDll$(Configuration)\ + arm64\TestZlibDll$(Configuration)\Tmp\ + + + arm\TestZlibDll$(Configuration)\ + arm\TestZlibDll$(Configuration)\Tmp\ + + + arm\TestZlibDll$(Configuration)\ + arm\TestZlibDll$(Configuration)\Tmp\ + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x86\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + $(OutDir)testzlib.pdb + Console + false + + + MachineX86 
+ + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + Default + MultiThreaded + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x86\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + Console + true + true + false + + + MachineX86 + + + + + X64 + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + $(OutDir)testzlib.pdb + Console + MachineX64 + + + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + $(OutDir)testzlib.pdb + Console + + + + + + Disabled + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;_DEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDebugDLL + false + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllDebug\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + $(OutDir)testzlib.pdb + Console + + + + + X64 + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + 
+ + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + Console + true + true + MachineX64 + + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + Console + true + true + + + + + + MaxSpeed + OnlyExplicitInline + true + ..\..\..;..\..\minizip;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;ZLIB_WINAPI;NDEBUG;_CONSOLE;WIN64;%(PreprocessorDefinitions) + true + Default + MultiThreadedDLL + false + true + + + $(IntDir) + Level3 + ProgramDatabase + + + x64\ZlibDllRelease\zlibwapi.lib;%(AdditionalDependencies) + $(OutDir)testzlibdll.exe + true + Console + true + true + + + + + + + + {8fd826f8-3739-44e6-8cc8-997122e53b8d} + + + + + + \ No newline at end of file diff --git a/third-party/zlib/contrib/vstudio/vc17/zlib.rc b/third-party/zlib/contrib/vstudio/vc17/zlib.rc new file mode 100644 index 00000000..a55f341c --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/zlib.rc @@ -0,0 +1,32 @@ +#include + +#define IDR_VERSION1 1 +IDR_VERSION1 VERSIONINFO MOVEABLE IMPURE LOADONCALL DISCARDABLE + FILEVERSION 1, 3, 1, 0 + PRODUCTVERSION 1, 3, 1, 0 + FILEFLAGSMASK VS_FFI_FILEFLAGSMASK + FILEFLAGS 0 + FILEOS VOS_DOS_WINDOWS32 + FILETYPE VFT_DLL + FILESUBTYPE 0 // not used +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + //language ID = U.S. 
English, char set = Windows, Multilingual + + BEGIN + VALUE "FileDescription", "zlib data compression and ZIP file I/O library\0" + VALUE "FileVersion", "1.3.1\0" + VALUE "InternalName", "zlib\0" + VALUE "OriginalFilename", "zlibwapi.dll\0" + VALUE "ProductName", "ZLib.DLL\0" + VALUE "Comments","DLL support by Alessandro Iacopetti & Gilles Vollant\0" + VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x0409, 1252 + END +END diff --git a/third-party/zlib/contrib/vstudio/vc17/zlibstat.vcxproj b/third-party/zlib/contrib/vstudio/vc17/zlibstat.vcxproj new file mode 100644 index 00000000..b946ac2a --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/zlibstat.vcxproj @@ -0,0 +1,602 @@ + + + + + Debug + ARM + + + Debug + ARM64 + + + Debug + Win32 + + + Debug + x64 + + + ReleaseWithoutAsm + ARM + + + ReleaseWithoutAsm + ARM64 + + + ReleaseWithoutAsm + Win32 + + + ReleaseWithoutAsm + x64 + + + Release + ARM + + + Release + ARM64 + + + Release + Win32 + + + Release + x64 + + + + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8} + 10.0 + + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + Unicode + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + StaticLibrary + false + v143 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + x86\ZlibStat$(Configuration)\ + x86\ZlibStat$(Configuration)\Tmp\ + x86\ZlibStat$(Configuration)\ + x86\ZlibStat$(Configuration)\Tmp\ + x86\ZlibStat$(Configuration)\ + x86\ZlibStat$(Configuration)\Tmp\ + x64\ZlibStat$(Configuration)\ + x64\ZlibStat$(Configuration)\Tmp\ + x64\ZlibStat$(Configuration)\ + 
x64\ZlibStat$(Configuration)\Tmp\ + x64\ZlibStat$(Configuration)\ + x64\ZlibStat$(Configuration)\Tmp\ + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + + + arm64\ZlibStat$(Configuration)\ + arm64\ZlibStat$(Configuration)\Tmp\ + + + arm64\ZlibStat$(Configuration)\ + arm64\ZlibStat$(Configuration)\Tmp\ + + + arm64\ZlibStat$(Configuration)\ + arm64\ZlibStat$(Configuration)\Tmp\ + + + arm\ZlibStat$(Configuration)\ + arm\ZlibStat$(Configuration)\Tmp\ + + + arm\ZlibStat$(Configuration)\ + arm\ZlibStat$(Configuration)\Tmp\ + + + arm\ZlibStat$(Configuration)\ + arm\ZlibStat$(Configuration)\Tmp\ + + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + OldStyle + + + 0x040c + + + /MACHINE:X86 /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + true + + + MultiThreaded + false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + /MACHINE:X86 /NODEFAULTLIB %(AdditionalOptions) + %(AdditionalDependencies) + $(OutDir)zlibstat.lib + true + + + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) + true + + + MultiThreaded + false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + 
/MACHINE:X86 /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + X64 + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + OldStyle + + + 0x040c + + + /MACHINE:AMD64 /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + OldStyle + + + 0x040c + + + /MACHINE:ARM64 /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + OldStyle + + + 0x040c + + + /MACHINE:ARM /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + X64 + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + /MACHINE:AMD64 /NODEFAULTLIB %(AdditionalOptions) + %(AdditionalDependencies) + $(OutDir)zlibstat.lib + true + + + + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + 
false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + /MACHINE:ARM64 /NODEFAULTLIB %(AdditionalOptions) + %(AdditionalDependencies) + $(OutDir)zlibstat.lib + true + + + + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + /MACHINE:ARM /NODEFAULTLIB %(AdditionalOptions) + %(AdditionalDependencies) + $(OutDir)zlibstat.lib + true + + + + + X64 + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + /MACHINE:AMD64 /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + /MACHINE:ARM64 /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibstat.pch + $(IntDir) + $(IntDir) + $(OutDir) + Level3 + true + + + 0x040c + + + /MACHINE:ARM /NODEFAULTLIB %(AdditionalOptions) + $(OutDir)zlibstat.lib + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline 
at end of file diff --git a/third-party/zlib/contrib/vstudio/vc17/zlibvc.def b/third-party/zlib/contrib/vstudio/vc17/zlibvc.def new file mode 100644 index 00000000..53947cc3 --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/zlibvc.def @@ -0,0 +1,158 @@ +LIBRARY +; zlib data compression and ZIP file I/O library + +VERSION 1.3.1 + +EXPORTS + adler32 @1 + compress @2 + crc32 @3 + deflate @4 + deflateCopy @5 + deflateEnd @6 + deflateInit2_ @7 + deflateInit_ @8 + deflateParams @9 + deflateReset @10 + deflateSetDictionary @11 + gzclose @12 + gzdopen @13 + gzerror @14 + gzflush @15 + gzopen @16 + gzread @17 + gzwrite @18 + inflate @19 + inflateEnd @20 + inflateInit2_ @21 + inflateInit_ @22 + inflateReset @23 + inflateSetDictionary @24 + inflateSync @25 + uncompress @26 + zlibVersion @27 + gzprintf @28 + gzputc @29 + gzgetc @30 + gzseek @31 + gzrewind @32 + gztell @33 + gzeof @34 + gzsetparams @35 + zError @36 + inflateSyncPoint @37 + get_crc_table @38 + compress2 @39 + gzputs @40 + gzgets @41 + inflateCopy @42 + inflateBackInit_ @43 + inflateBack @44 + inflateBackEnd @45 + compressBound @46 + deflateBound @47 + gzclearerr @48 + gzungetc @49 + zlibCompileFlags @50 + deflatePrime @51 + deflatePending @52 + + unzOpen @61 + unzClose @62 + unzGetGlobalInfo @63 + unzGetCurrentFileInfo @64 + unzGoToFirstFile @65 + unzGoToNextFile @66 + unzOpenCurrentFile @67 + unzReadCurrentFile @68 + unzOpenCurrentFile3 @69 + unztell @70 + unzeof @71 + unzCloseCurrentFile @72 + unzGetGlobalComment @73 + unzStringFileNameCompare @74 + unzLocateFile @75 + unzGetLocalExtrafield @76 + unzOpen2 @77 + unzOpenCurrentFile2 @78 + unzOpenCurrentFilePassword @79 + + zipOpen @80 + zipOpenNewFileInZip @81 + zipWriteInFileInZip @82 + zipCloseFileInZip @83 + zipClose @84 + zipOpenNewFileInZip2 @86 + zipCloseFileInZipRaw @87 + zipOpen2 @88 + zipOpenNewFileInZip3 @89 + + unzGetFilePos @100 + unzGoToFilePos @101 + + fill_win32_filefunc @110 + +; zlibwapi v1.2.4 added: + fill_win32_filefunc64 @111 + 
fill_win32_filefunc64A @112 + fill_win32_filefunc64W @113 + + unzOpen64 @120 + unzOpen2_64 @121 + unzGetGlobalInfo64 @122 + unzGetCurrentFileInfo64 @124 + unzGetCurrentFileZStreamPos64 @125 + unztell64 @126 + unzGetFilePos64 @127 + unzGoToFilePos64 @128 + + zipOpen64 @130 + zipOpen2_64 @131 + zipOpenNewFileInZip64 @132 + zipOpenNewFileInZip2_64 @133 + zipOpenNewFileInZip3_64 @134 + zipOpenNewFileInZip4_64 @135 + zipCloseFileInZipRaw64 @136 + +; zlib1 v1.2.4 added: + adler32_combine @140 + crc32_combine @142 + deflateSetHeader @144 + deflateTune @145 + gzbuffer @146 + gzclose_r @147 + gzclose_w @148 + gzdirect @149 + gzoffset @150 + inflateGetHeader @156 + inflateMark @157 + inflatePrime @158 + inflateReset2 @159 + inflateUndermine @160 + +; zlib1 v1.2.6 added: + gzgetc_ @161 + inflateResetKeep @163 + deflateResetKeep @164 + +; zlib1 v1.2.7 added: + gzopen_w @165 + +; zlib1 v1.2.8 added: + inflateGetDictionary @166 + gzvprintf @167 + +; zlib1 v1.2.9 added: + inflateCodesUsed @168 + inflateValidate @169 + uncompress2 @170 + gzfread @171 + gzfwrite @172 + deflateGetDictionary @173 + adler32_z @174 + crc32_z @175 + +; zlib1 v1.2.12 added: + crc32_combine_gen @176 + crc32_combine_gen64 @177 + crc32_combine_op @178 diff --git a/third-party/zlib/contrib/vstudio/vc17/zlibvc.sln b/third-party/zlib/contrib/vstudio/vc17/zlibvc.sln new file mode 100644 index 00000000..67896b74 --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/zlibvc.sln @@ -0,0 +1,179 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.4.33015.44 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zlibvc", "zlibvc.vcxproj", "{8FD826F8-3739-44E6-8CC8-997122E53B8D}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zlibstat", "zlibstat.vcxproj", "{745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "testzlib", 
"testzlib.vcxproj", "{AA6666AA-E09F-4135-9C0C-4FE50C3C654B}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "testzlibdll", "testzlibdll.vcxproj", "{C52F9E7B-498A-42BE-8DB4-85A15694366A}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "minizip", "minizip.vcxproj", "{48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "miniunz", "miniunz.vcxproj", "{C52F9E7B-498A-42BE-8DB4-85A15694382A}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|ARM = Debug|ARM + Debug|ARM64 = Debug|ARM64 + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|ARM = Release|ARM + Release|ARM64 = Release|ARM64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + ReleaseWithoutAsm|ARM = ReleaseWithoutAsm|ARM + ReleaseWithoutAsm|ARM64 = ReleaseWithoutAsm|ARM64 + ReleaseWithoutAsm|Win32 = ReleaseWithoutAsm|Win32 + ReleaseWithoutAsm|x64 = ReleaseWithoutAsm|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|ARM.ActiveCfg = Debug|ARM + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|ARM.Build.0 = Debug|ARM + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|ARM64.Build.0 = Debug|ARM64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|Win32.ActiveCfg = Debug|Win32 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|Win32.Build.0 = Debug|Win32 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|x64.ActiveCfg = Debug|x64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Debug|x64.Build.0 = Debug|x64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|ARM.ActiveCfg = Release|ARM + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|ARM.Build.0 = Release|ARM + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|ARM64.ActiveCfg = Release|ARM64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|ARM64.Build.0 = Release|ARM64 + 
{8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|Win32.ActiveCfg = Release|Win32 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|Win32.Build.0 = Release|Win32 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|x64.ActiveCfg = Release|x64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.Release|x64.Build.0 = Release|x64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|ARM.ActiveCfg = ReleaseWithoutAsm|ARM + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|ARM.Build.0 = ReleaseWithoutAsm|ARM + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|ARM64.ActiveCfg = ReleaseWithoutAsm|ARM64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|ARM64.Build.0 = ReleaseWithoutAsm|ARM64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|Win32.ActiveCfg = ReleaseWithoutAsm|Win32 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|Win32.Build.0 = ReleaseWithoutAsm|Win32 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|x64.ActiveCfg = ReleaseWithoutAsm|x64 + {8FD826F8-3739-44E6-8CC8-997122E53B8D}.ReleaseWithoutAsm|x64.Build.0 = ReleaseWithoutAsm|x64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|ARM.ActiveCfg = Debug|ARM + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|ARM.Build.0 = Debug|ARM + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|ARM64.Build.0 = Debug|ARM64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|Win32.ActiveCfg = Debug|Win32 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|Win32.Build.0 = Debug|Win32 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|x64.ActiveCfg = Debug|x64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Debug|x64.Build.0 = Debug|x64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|ARM.ActiveCfg = Release|ARM + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|ARM.Build.0 = Release|ARM + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|ARM64.ActiveCfg = Release|ARM64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|ARM64.Build.0 = Release|ARM64 + 
{745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|Win32.ActiveCfg = Release|Win32 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|Win32.Build.0 = Release|Win32 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|x64.ActiveCfg = Release|x64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.Release|x64.Build.0 = Release|x64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|ARM.ActiveCfg = ReleaseWithoutAsm|ARM + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|ARM.Build.0 = ReleaseWithoutAsm|ARM + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|ARM64.ActiveCfg = ReleaseWithoutAsm|ARM64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|ARM64.Build.0 = ReleaseWithoutAsm|ARM64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|Win32.ActiveCfg = ReleaseWithoutAsm|Win32 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|Win32.Build.0 = ReleaseWithoutAsm|Win32 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|x64.ActiveCfg = ReleaseWithoutAsm|x64 + {745DEC58-EBB3-47A9-A9B8-4C6627C01BF8}.ReleaseWithoutAsm|x64.Build.0 = ReleaseWithoutAsm|x64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM.ActiveCfg = Debug|ARM + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM.Build.0 = Debug|ARM + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM64.Build.0 = Debug|ARM64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|Win32.ActiveCfg = Debug|Win32 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|Win32.Build.0 = Debug|Win32 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|x64.ActiveCfg = Debug|x64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Debug|x64.Build.0 = Debug|x64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM.ActiveCfg = Release|ARM + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM.Build.0 = Release|ARM + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM64.ActiveCfg = Release|ARM64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM64.Build.0 = Release|ARM64 + 
{AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|Win32.ActiveCfg = Release|Win32 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|Win32.Build.0 = Release|Win32 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|x64.ActiveCfg = Release|x64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.Release|x64.Build.0 = Release|x64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM.ActiveCfg = ReleaseWithoutAsm|ARM + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM.Build.0 = ReleaseWithoutAsm|ARM + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM64.ActiveCfg = ReleaseWithoutAsm|ARM64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM64.Build.0 = ReleaseWithoutAsm|ARM64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|Win32.ActiveCfg = ReleaseWithoutAsm|Win32 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|Win32.Build.0 = ReleaseWithoutAsm|Win32 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|x64.ActiveCfg = ReleaseWithoutAsm|x64 + {AA6666AA-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|x64.Build.0 = ReleaseWithoutAsm|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|ARM.ActiveCfg = Debug|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|ARM.Build.0 = Debug|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|ARM64.Build.0 = Debug|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|Win32.ActiveCfg = Debug|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|Win32.Build.0 = Debug|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|x64.ActiveCfg = Debug|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Debug|x64.Build.0 = Debug|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|ARM.ActiveCfg = Release|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|ARM.Build.0 = Release|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|ARM64.ActiveCfg = Release|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|ARM64.Build.0 = Release|ARM64 + 
{C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|Win32.ActiveCfg = Release|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|Win32.Build.0 = Release|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|x64.ActiveCfg = Release|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.Release|x64.Build.0 = Release|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.ReleaseWithoutAsm|ARM.ActiveCfg = Release|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.ReleaseWithoutAsm|ARM.Build.0 = Release|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.ReleaseWithoutAsm|ARM64.ActiveCfg = Release|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.ReleaseWithoutAsm|ARM64.Build.0 = Release|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.ReleaseWithoutAsm|Win32.ActiveCfg = Release|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694366A}.ReleaseWithoutAsm|x64.ActiveCfg = Release|x64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM.ActiveCfg = Debug|ARM + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM.Build.0 = Debug|ARM + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|ARM64.Build.0 = Debug|ARM64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|Win32.ActiveCfg = Debug|Win32 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|Win32.Build.0 = Debug|Win32 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|x64.ActiveCfg = Debug|x64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Debug|x64.Build.0 = Debug|x64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM.ActiveCfg = Release|ARM + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM.Build.0 = Release|ARM + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM64.ActiveCfg = Release|ARM64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|ARM64.Build.0 = Release|ARM64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|Win32.ActiveCfg = Release|Win32 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|Win32.Build.0 = Release|Win32 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|x64.ActiveCfg = Release|x64 + 
{48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.Release|x64.Build.0 = Release|x64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM.ActiveCfg = Release|ARM + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM.Build.0 = Release|ARM + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM64.ActiveCfg = Release|ARM64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|ARM64.Build.0 = Release|ARM64 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|Win32.ActiveCfg = Release|Win32 + {48CDD9DC-E09F-4135-9C0C-4FE50C3C654B}.ReleaseWithoutAsm|x64.ActiveCfg = Release|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|ARM.ActiveCfg = Debug|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|ARM.Build.0 = Debug|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|ARM64.Build.0 = Debug|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|Win32.ActiveCfg = Debug|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|Win32.Build.0 = Debug|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|x64.ActiveCfg = Debug|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Debug|x64.Build.0 = Debug|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|ARM.ActiveCfg = Release|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|ARM.Build.0 = Release|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|ARM64.ActiveCfg = Release|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|ARM64.Build.0 = Release|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|Win32.ActiveCfg = Release|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|Win32.Build.0 = Release|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|x64.ActiveCfg = Release|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.Release|x64.Build.0 = Release|x64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.ReleaseWithoutAsm|ARM.ActiveCfg = Release|ARM + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.ReleaseWithoutAsm|ARM.Build.0 = Release|ARM + 
{C52F9E7B-498A-42BE-8DB4-85A15694382A}.ReleaseWithoutAsm|ARM64.ActiveCfg = Release|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.ReleaseWithoutAsm|ARM64.Build.0 = Release|ARM64 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.ReleaseWithoutAsm|Win32.ActiveCfg = Release|Win32 + {C52F9E7B-498A-42BE-8DB4-85A15694382A}.ReleaseWithoutAsm|x64.ActiveCfg = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {EAA58685-56D9-43F2-8703-FD2CB020745E} + EndGlobalSection +EndGlobal diff --git a/third-party/zlib/contrib/vstudio/vc17/zlibvc.vcxproj b/third-party/zlib/contrib/vstudio/vc17/zlibvc.vcxproj new file mode 100644 index 00000000..10a7a901 --- /dev/null +++ b/third-party/zlib/contrib/vstudio/vc17/zlibvc.vcxproj @@ -0,0 +1,875 @@ + + + + + Debug + ARM + + + Debug + ARM64 + + + Debug + Win32 + + + Debug + x64 + + + ReleaseWithoutAsm + ARM + + + ReleaseWithoutAsm + ARM64 + + + ReleaseWithoutAsm + Win32 + + + ReleaseWithoutAsm + x64 + + + Release + ARM + + + Release + ARM64 + + + Release + Win32 + + + Release + x64 + + + + {8FD826F8-3739-44E6-8CC8-997122E53B8D} + 10.0 + + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + v143 + Unicode + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + true + v143 + + + DynamicLibrary + false + v143 + + + DynamicLibrary + false + v143 + + + DynamicLibrary + false + v143 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>10.0.30128.1 + x86\ZlibDll$(Configuration)\ + x86\ZlibDll$(Configuration)\Tmp\ + true + false + x86\ZlibDll$(Configuration)\ + x86\ZlibDll$(Configuration)\Tmp\ + false + false + 
x86\ZlibDll$(Configuration)\ + x86\ZlibDll$(Configuration)\Tmp\ + false + false + x64\ZlibDll$(Configuration)\ + x64\ZlibDll$(Configuration)\Tmp\ + true + true + true + false + false + false + x64\ZlibDll$(Configuration)\ + x64\ZlibDll$(Configuration)\Tmp\ + false + false + false + false + false + false + x64\ZlibDll$(Configuration)\ + x64\ZlibDll$(Configuration)\Tmp\ + false + false + false + false + false + false + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + AllRules.ruleset + + + AllRules.ruleset + AllRules.ruleset + AllRules.ruleset + + + + + + + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + zlibwapi + + + arm64\ZlibDll$(Configuration)\ + arm64\ZlibDll$(Configuration)\Tmp\ + + + arm\ZlibDll$(Configuration)\ + arm\ZlibDll$(Configuration)\Tmp\ + + + arm64\ZlibDll$(Configuration)\ + arm64\ZlibDll$(Configuration)\Tmp\ + + + arm64\ZlibDll$(Configuration)\ + arm64\ZlibDll$(Configuration)\Tmp\ + + + arm\ZlibDll$(Configuration)\ + arm\ZlibDll$(Configuration)\Tmp\ + + + arm\ZlibDll$(Configuration)\ + arm\ZlibDll$(Configuration)\Tmp\ + + + + _DEBUG;%(PreprocessorDefinitions) + true + true + Win32 + $(OutDir)zlibvc.tlb + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibvc.pch + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + ProgramDatabase + + + _DEBUG;%(PreprocessorDefinitions) + 0x040c + + + /MACHINE:I386 %(AdditionalOptions) + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + .\zlibvc.def + true + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + false + + + $(OutDir)zlibwapi.lib + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + Win32 + 
$(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + /MACHINE:I386 %(AdditionalOptions) + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + false + + + $(OutDir)zlibwapi.lib + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + Win32 + $(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;%(PreprocessorDefinitions) + true + + + MultiThreaded + false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + /MACHINE:I386 %(AdditionalOptions) + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + false + + + $(OutDir)zlibwapi.lib + false + + + + + _DEBUG;%(PreprocessorDefinitions) + true + true + X64 + $(OutDir)zlibvc.tlb + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibvc.pch + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + ProgramDatabase + + + _DEBUG;%(PreprocessorDefinitions) + 0x040c + + + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + .\zlibvc.def + true + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + MachineX64 + + + + + _DEBUG;%(PreprocessorDefinitions) + true + true + $(OutDir)zlibvc.tlb + + + 
Disabled + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibvc.pch + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + ProgramDatabase + + + _DEBUG;%(PreprocessorDefinitions) + 0x040c + + + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + .\zlibvc.def + true + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + + + + + _DEBUG;%(PreprocessorDefinitions) + true + true + $(OutDir)zlibvc.tlb + + + Disabled + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + + + MultiThreadedDebugDLL + false + $(IntDir)zlibvc.pch + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + ProgramDatabase + + + _DEBUG;%(PreprocessorDefinitions) + 0x040c + + + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + .\zlibvc.def + true + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + X64 + $(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + MachineX64 + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + $(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + 
WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + $(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + WIN32;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + X64 + $(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + MachineX64 + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + $(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN64;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + 
false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + + + + + NDEBUG;%(PreprocessorDefinitions) + true + true + $(OutDir)zlibvc.tlb + + + OnlyExplicitInline + ..\..\..;%(AdditionalIncludeDirectories) + _CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;ZLIB_WINAPI;WIN32;%(PreprocessorDefinitions) + true + + + MultiThreadedDLL + false + true + $(IntDir)zlibvc.pch + All + $(IntDir) + $(IntDir) + $(OutDir) + + + Level3 + true + + + NDEBUG;%(PreprocessorDefinitions) + 0x040c + + + %(AdditionalDependencies) + $(OutDir)zlibwapi.dll + true + false + .\zlibvc.def + $(OutDir)zlibwapi.pdb + true + $(OutDir)zlibwapi.map + Windows + $(OutDir)zlibwapi.lib + + + + + + + + + + + + + + + + + + + + + %(AdditionalIncludeDirectories) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + %(AdditionalIncludeDirectories) + %(AdditionalIncludeDirectories) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + + + %(AdditionalIncludeDirectories) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + %(AdditionalIncludeDirectories) + %(AdditionalIncludeDirectories) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + ZLIB_INTERNAL;%(PreprocessorDefinitions) + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/third-party/zlib/contrib/vstudio/vc9/zlib.rc b/third-party/zlib/contrib/vstudio/vc9/zlib.rc index 29af8e11..856bd11f 100644 --- a/third-party/zlib/contrib/vstudio/vc9/zlib.rc +++ b/third-party/zlib/contrib/vstudio/vc9/zlib.rc @@ -2,8 +2,8 @@ #define IDR_VERSION1 1 IDR_VERSION1 VERSIONINFO MOVEABLE IMPURE 
LOADONCALL DISCARDABLE - FILEVERSION 1, 3, 0, 0 - PRODUCTVERSION 1, 3, 0, 0 + FILEVERSION 1, 3, 1, 0 + PRODUCTVERSION 1, 3, 1, 0 FILEFLAGSMASK VS_FFI_FILEFLAGSMASK FILEFLAGS 0 FILEOS VOS_DOS_WINDOWS32 @@ -17,12 +17,12 @@ BEGIN BEGIN VALUE "FileDescription", "zlib data compression and ZIP file I/O library\0" - VALUE "FileVersion", "1.3.0\0" + VALUE "FileVersion", "1.3.1\0" VALUE "InternalName", "zlib\0" VALUE "OriginalFilename", "zlibwapi.dll\0" VALUE "ProductName", "ZLib.DLL\0" VALUE "Comments","DLL support by Alessandro Iacopetti & Gilles Vollant\0" - VALUE "LegalCopyright", "(C) 1995-2023 Jean-loup Gailly & Mark Adler\0" + VALUE "LegalCopyright", "(C) 1995-2024 Jean-loup Gailly & Mark Adler\0" END END BLOCK "VarFileInfo" diff --git a/third-party/zlib/contrib/vstudio/vc9/zlibvc.def b/third-party/zlib/contrib/vstudio/vc9/zlibvc.def index f28aa6c7..3234a02d 100644 --- a/third-party/zlib/contrib/vstudio/vc9/zlibvc.def +++ b/third-party/zlib/contrib/vstudio/vc9/zlibvc.def @@ -1,7 +1,7 @@ LIBRARY ; zlib data compression and ZIP file I/O library -VERSION 1.3 +VERSION 1.3.1 EXPORTS adler32 @1 diff --git a/third-party/zlib/deflate.c b/third-party/zlib/deflate.c index bd011751..012ea814 100644 --- a/third-party/zlib/deflate.c +++ b/third-party/zlib/deflate.c @@ -1,5 +1,5 @@ /* deflate.c -- compress data using the deflation algorithm - * Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -52,7 +52,7 @@ #include "deflate.h" const char deflate_copyright[] = - " deflate 1.3 Copyright 1995-2023 Jean-loup Gailly and Mark Adler "; + " deflate 1.3.1 Copyright 1995-2024 Jean-loup Gailly and Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. 
If for some reason you cannot @@ -493,7 +493,7 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method, * symbols from which it is being constructed. */ - s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, 4); + s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, LIT_BUFS); s->pending_buf_size = (ulg)s->lit_bufsize * 4; if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL || @@ -503,8 +503,14 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method, deflateEnd (strm); return Z_MEM_ERROR; } +#ifdef LIT_MEM + s->d_buf = (ushf *)(s->pending_buf + (s->lit_bufsize << 1)); + s->l_buf = s->pending_buf + (s->lit_bufsize << 2); + s->sym_end = s->lit_bufsize - 1; +#else s->sym_buf = s->pending_buf + s->lit_bufsize; s->sym_end = (s->lit_bufsize - 1) * 3; +#endif /* We avoid equality with lit_bufsize*3 because of wraparound at 64K * on 16 bit machines and because stored blocks are restricted to * 64K-1 bytes. @@ -720,9 +726,15 @@ int ZEXPORT deflatePrime(z_streamp strm, int bits, int value) { if (deflateStateCheck(strm)) return Z_STREAM_ERROR; s = strm->state; +#ifdef LIT_MEM + if (bits < 0 || bits > 16 || + (uchf *)s->d_buf < s->pending_out + ((Buf_size + 7) >> 3)) + return Z_BUF_ERROR; +#else if (bits < 0 || bits > 16 || s->sym_buf < s->pending_out + ((Buf_size + 7) >> 3)) return Z_BUF_ERROR; +#endif do { put = Buf_size - s->bi_valid; if (put > bits) @@ -1294,7 +1306,7 @@ int ZEXPORT deflateCopy(z_streamp dest, z_streamp source) { ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte)); ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos)); ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos)); - ds->pending_buf = (uchf *) ZALLOC(dest, ds->lit_bufsize, 4); + ds->pending_buf = (uchf *) ZALLOC(dest, ds->lit_bufsize, LIT_BUFS); if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL || ds->pending_buf == Z_NULL) { @@ -1305,10 +1317,15 @@ int ZEXPORT deflateCopy(z_streamp dest, z_streamp source) { 
zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); zmemcpy((voidpf)ds->prev, (voidpf)ss->prev, ds->w_size * sizeof(Pos)); zmemcpy((voidpf)ds->head, (voidpf)ss->head, ds->hash_size * sizeof(Pos)); - zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); + zmemcpy(ds->pending_buf, ss->pending_buf, ds->lit_bufsize * LIT_BUFS); ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); +#ifdef LIT_MEM + ds->d_buf = (ushf *)(ds->pending_buf + (ds->lit_bufsize << 1)); + ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2); +#else ds->sym_buf = ds->pending_buf + ds->lit_bufsize; +#endif ds->l_desc.dyn_tree = ds->dyn_ltree; ds->d_desc.dyn_tree = ds->dyn_dtree; @@ -1539,13 +1556,21 @@ local uInt longest_match(deflate_state *s, IPos cur_match) { */ local void check_match(deflate_state *s, IPos start, IPos match, int length) { /* check that the match is indeed a match */ - if (zmemcmp(s->window + match, - s->window + start, length) != EQUAL) { - fprintf(stderr, " start %u, match %u, length %d\n", - start, match, length); + Bytef *back = s->window + (int)match, *here = s->window + start; + IPos len = length; + if (match == (IPos)-1) { + /* match starts one byte before the current window -- just compare the + subsequent length-1 bytes */ + back++; + here++; + len--; + } + if (zmemcmp(back, here, len) != EQUAL) { + fprintf(stderr, " start %u, match %d, length %d\n", + start, (int)match, length); do { - fprintf(stderr, "%c%c", s->window[match++], s->window[start++]); - } while (--length != 0); + fprintf(stderr, "(%02x %02x)", *back++, *here++); + } while (--len != 0); z_error("invalid match"); } if (z_verbose > 1) { diff --git a/third-party/zlib/deflate.h b/third-party/zlib/deflate.h index 86967914..300c6ada 100644 --- a/third-party/zlib/deflate.h +++ b/third-party/zlib/deflate.h @@ -1,5 +1,5 @@ /* deflate.h -- internal compression state - * Copyright (C) 1995-2018 Jean-loup Gailly + * Copyright (C) 1995-2024 Jean-loup Gailly * For 
conditions of distribution and use, see copyright notice in zlib.h */ @@ -23,6 +23,10 @@ # define GZIP #endif +/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at + the cost of a larger memory footprint */ +/* #define LIT_MEM */ + /* =========================================================================== * Internal compression state. */ @@ -217,7 +221,14 @@ typedef struct internal_state { /* Depth of each subtree used as tie breaker for trees of equal frequency */ +#ifdef LIT_MEM +# define LIT_BUFS 5 + ushf *d_buf; /* buffer for distances */ + uchf *l_buf; /* buffer for literals/lengths */ +#else +# define LIT_BUFS 4 uchf *sym_buf; /* buffer for distances and literals/lengths */ +#endif uInt lit_bufsize; /* Size of match buffer for literals/lengths. There are 4 reasons for @@ -239,7 +250,7 @@ typedef struct internal_state { * - I can't count above 4 */ - uInt sym_next; /* running index in sym_buf */ + uInt sym_next; /* running index in symbol buffer */ uInt sym_end; /* symbol table full when sym_next reaches this */ ulg opt_len; /* bit length of current block with optimal trees */ @@ -318,6 +329,25 @@ void ZLIB_INTERNAL _tr_stored_block(deflate_state *s, charf *buf, extern const uch ZLIB_INTERNAL _dist_code[]; #endif +#ifdef LIT_MEM +# define _tr_tally_lit(s, c, flush) \ + { uch cc = (c); \ + s->d_buf[s->sym_next] = 0; \ + s->l_buf[s->sym_next++] = cc; \ + s->dyn_ltree[cc].Freq++; \ + flush = (s->sym_next == s->sym_end); \ + } +# define _tr_tally_dist(s, distance, length, flush) \ + { uch len = (uch)(length); \ + ush dist = (ush)(distance); \ + s->d_buf[s->sym_next] = dist; \ + s->l_buf[s->sym_next++] = len; \ + dist--; \ + s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \ + s->dyn_dtree[d_code(dist)].Freq++; \ + flush = (s->sym_next == s->sym_end); \ + } +#else # define _tr_tally_lit(s, c, flush) \ { uch cc = (c); \ s->sym_buf[s->sym_next++] = 0; \ @@ -337,6 +367,7 @@ void ZLIB_INTERNAL _tr_stored_block(deflate_state *s, charf 
*buf, s->dyn_dtree[d_code(dist)].Freq++; \ flush = (s->sym_next == s->sym_end); \ } +#endif #else # define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c) # define _tr_tally_dist(s, distance, length, flush) \ diff --git a/third-party/zlib/doc/algorithm.txt b/third-party/zlib/doc/algorithm.txt index c97f4950..029e5a31 100644 --- a/third-party/zlib/doc/algorithm.txt +++ b/third-party/zlib/doc/algorithm.txt @@ -77,7 +77,7 @@ table took no time (and if you had infinite memory), then there would only be a first level table to cover all the way to the longest code. However, building the table ends up taking a lot longer for more bits since short codes are replicated many times in such a table. What inflate() does is -simply to make the number of bits in the first table a variable, and then +simply to make the number of bits in the first table a variable, and then to set that variable for the maximum speed. For inflate, which has 286 possible codes for the literal/length tree, the size diff --git a/third-party/zlib/examples/gzlog.c b/third-party/zlib/examples/gzlog.c index b977802d..da1b02e7 100644 --- a/third-party/zlib/examples/gzlog.c +++ b/third-party/zlib/examples/gzlog.c @@ -212,8 +212,8 @@ to the appropriate recovery below. If there is no foo.add file, provide a zero data length to the recovery. In that case, the append recovery restores the foo.gz to the previous compressed + uncompressed data state. - For the the compress recovery, a missing foo.add file results in foo.gz - being restored to the previous compressed-only data state. + For the compress recovery, a missing foo.add file results in foo.gz being + restored to the previous compressed-only data state. 
- Append recovery: - Pick up append at + step above - Compress recovery: diff --git a/third-party/zlib/examples/zran.c b/third-party/zlib/examples/zran.c index 32c93686..d3135955 100644 --- a/third-party/zlib/examples/zran.c +++ b/third-party/zlib/examples/zran.c @@ -267,7 +267,7 @@ static inline void append_bits(unsigned value, int bits, } } -// Insert enough bits in the form of empty deflate blocks in front of the the +// Insert enough bits in the form of empty deflate blocks in front of the // low bits bits of value, in order to bring the sequence to a byte boundary. // Then feed that to inflate(). This does what inflatePrime() does, except that // a negative value of bits is not supported. bits must be in 0..16. If the diff --git a/third-party/zlib/gzguts.h b/third-party/zlib/gzguts.h index f9375047..eba72085 100644 --- a/third-party/zlib/gzguts.h +++ b/third-party/zlib/gzguts.h @@ -1,5 +1,5 @@ /* gzguts.h -- zlib internal header definitions for gz* operations - * Copyright (C) 2004-2019 Mark Adler + * Copyright (C) 2004-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -210,9 +210,5 @@ char ZLIB_INTERNAL *gz_strwinerror(DWORD error); /* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t value -- needed when comparing unsigned to z_off64_t, which is signed (possible z_off64_t types off_t, off64_t, and long are all signed) */ -#ifdef INT_MAX -# define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX) -#else unsigned ZLIB_INTERNAL gz_intmax(void); -# define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax()) -#endif +#define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax()) diff --git a/third-party/zlib/gzlib.c b/third-party/zlib/gzlib.c index 29fc4486..983153cc 100644 --- a/third-party/zlib/gzlib.c +++ b/third-party/zlib/gzlib.c @@ -1,5 +1,5 @@ /* gzlib.c -- zlib functions common to reading and writing gzip files - * Copyright (C) 2004-2019 Mark Adler + 
* Copyright (C) 2004-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -563,20 +563,20 @@ void ZLIB_INTERNAL gz_error(gz_statep state, int err, const char *msg) { #endif } -#ifndef INT_MAX /* portably return maximum value for an int (when limits.h presumed not available) -- we need to do this to cover cases where 2's complement not used, since C standard permits 1's complement and sign-bit representations, otherwise we could just use ((unsigned)-1) >> 1 */ unsigned ZLIB_INTERNAL gz_intmax(void) { - unsigned p, q; - - p = 1; +#ifdef INT_MAX + return INT_MAX; +#else + unsigned p = 1, q; do { q = p; p <<= 1; p++; } while (p > q); return q >> 1; -} #endif +} diff --git a/third-party/zlib/inflate.c b/third-party/zlib/inflate.c index b0757a9b..94ecff01 100644 --- a/third-party/zlib/inflate.c +++ b/third-party/zlib/inflate.c @@ -1387,7 +1387,7 @@ int ZEXPORT inflateSync(z_streamp strm) { /* if first time, start search in bit buffer */ if (state->mode != SYNC) { state->mode = SYNC; - state->hold <<= state->bits & 7; + state->hold >>= state->bits & 7; state->bits -= state->bits & 7; len = 0; while (state->bits >= 8) { diff --git a/third-party/zlib/inftrees.c b/third-party/zlib/inftrees.c index 8a208c2d..98cfe164 100644 --- a/third-party/zlib/inftrees.c +++ b/third-party/zlib/inftrees.c @@ -1,5 +1,5 @@ /* inftrees.c -- generate Huffman trees for efficient decoding - * Copyright (C) 1995-2023 Mark Adler + * Copyright (C) 1995-2024 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -9,7 +9,7 @@ #define MAXBITS 15 const char inflate_copyright[] = - " inflate 1.3 Copyright 1995-2023 Mark Adler "; + " inflate 1.3.1 Copyright 1995-2024 Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. 
If for some reason you cannot @@ -57,7 +57,7 @@ int ZLIB_INTERNAL inflate_table(codetype type, unsigned short FAR *lens, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; static const unsigned short lext[31] = { /* Length codes 257..285 extra */ 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, - 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 198, 203}; + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 203, 77}; static const unsigned short dbase[32] = { /* Distance codes 0..29 base */ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, diff --git a/third-party/zlib/inftrees.h b/third-party/zlib/inftrees.h index a10712d8..396f74b5 100644 --- a/third-party/zlib/inftrees.h +++ b/third-party/zlib/inftrees.h @@ -41,8 +41,8 @@ typedef struct { examples/enough.c found in the zlib distribution. The arguments to that program are the number of symbols, the initial root table size, and the maximum bit length of a code. "enough 286 9 15" for literal/length codes - returns returns 852, and "enough 30 6 15" for distance codes returns 592. - The initial root table size (9 or 6) is found in the fifth argument of the + returns 852, and "enough 30 6 15" for distance codes returns 592. The + initial root table size (9 or 6) is found in the fifth argument of the inflate_table() calls in inflate.c and infback.c. If the root table size is changed, then these maximum sizes would be need to be recalculated and updated. 
*/ diff --git a/third-party/zlib/old/visual-basic.txt b/third-party/zlib/old/visual-basic.txt index 57efe581..3c8d2a42 100644 --- a/third-party/zlib/old/visual-basic.txt +++ b/third-party/zlib/old/visual-basic.txt @@ -115,7 +115,7 @@ SUCCESS Then ReDim Preserve bytaryCpr(lngCprSiz - 1) Open strCprPth For Binary Access Write As #1 Put #1, , bytaryCpr() - Put #1, , lngOriSiz 'Add the the original size value to the end + Put #1, , lngOriSiz 'Add the original size value to the end (last 4 bytes) Close #1 Else diff --git a/third-party/zlib/os400/README400 b/third-party/zlib/os400/README400 index 6dd41aa6..30ed5a12 100644 --- a/third-party/zlib/os400/README400 +++ b/third-party/zlib/os400/README400 @@ -1,4 +1,4 @@ - ZLIB version 1.3.0 for OS/400 installation instructions + ZLIB version 1.3.1 for OS/400 installation instructions 1) Download and unpack the zlib tarball to some IFS directory. (i.e.: /path/to/the/zlib/ifs/source/directory) diff --git a/third-party/zlib/os400/zlib.inc b/third-party/zlib/os400/zlib.inc index 0d9e2f20..744729ab 100644 --- a/third-party/zlib/os400/zlib.inc +++ b/third-party/zlib/os400/zlib.inc @@ -1,7 +1,7 @@ * ZLIB.INC - Interface to the general purpose compression library * * ILE RPG400 version by Patrick Monnerat, DATASPHERE. - * Version 1.3.0 + * Version 1.3.1 * * * WARNING: @@ -22,12 +22,12 @@ * * Versioning information. * - D ZLIB_VERSION C '1.3.0' + D ZLIB_VERSION C '1.3.1' D ZLIB_VERNUM C X'12a0' D ZLIB_VER_MAJOR C 1 D ZLIB_VER_MINOR C 3 D ZLIB_VER_REVISION... - D C 0 + D C 1 D ZLIB_VER_SUBREVISION... 
D C 0 * diff --git a/third-party/zlib/qnx/package.qpg b/third-party/zlib/qnx/package.qpg index d882af2b..4877e0ef 100644 --- a/third-party/zlib/qnx/package.qpg +++ b/third-party/zlib/qnx/package.qpg @@ -25,10 +25,10 @@ - - - - + + + + @@ -63,7 +63,7 @@ - 1.3.0 + 1.3.1 Medium Stable diff --git a/third-party/zlib/test/example.c b/third-party/zlib/test/example.c index 582a17a3..c3521dd5 100644 --- a/third-party/zlib/test/example.c +++ b/third-party/zlib/test/example.c @@ -36,12 +36,12 @@ static uLong dictId; /* Adler32 value of the dictionary */ #ifdef Z_SOLO -void *myalloc(void *q, unsigned n, unsigned m) { +static void *myalloc(void *q, unsigned n, unsigned m) { (void)q; return calloc(n, m); } -void myfree(void *q, void *p) { +static void myfree(void *q, void *p) { (void)q; free(p); } @@ -57,7 +57,7 @@ static free_func zfree = (free_func)0; /* =========================================================================== * Test compress() and uncompress() */ -void test_compress(Byte *compr, uLong comprLen, Byte *uncompr, +static void test_compress(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) { int err; uLong len = (uLong)strlen(hello)+1; @@ -81,7 +81,7 @@ void test_compress(Byte *compr, uLong comprLen, Byte *uncompr, /* =========================================================================== * Test read/write of .gz files */ -void test_gzio(const char *fname, Byte *uncompr, uLong uncomprLen) { +static void test_gzio(const char *fname, Byte *uncompr, uLong uncomprLen) { #ifdef NO_GZCOMPRESS fprintf(stderr, "NO_GZCOMPRESS -- gz* functions cannot compress\n"); #else @@ -163,7 +163,7 @@ void test_gzio(const char *fname, Byte *uncompr, uLong uncomprLen) { /* =========================================================================== * Test deflate() with small buffers */ -void test_deflate(Byte *compr, uLong comprLen) { +static void test_deflate(Byte *compr, uLong comprLen) { z_stream c_stream; /* compression stream */ int err; uLong len = 
(uLong)strlen(hello)+1; @@ -198,7 +198,7 @@ void test_deflate(Byte *compr, uLong comprLen) { /* =========================================================================== * Test inflate() with small buffers */ -void test_inflate(Byte *compr, uLong comprLen, Byte *uncompr, +static void test_inflate(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) { int err; z_stream d_stream; /* decompression stream */ @@ -237,7 +237,7 @@ void test_inflate(Byte *compr, uLong comprLen, Byte *uncompr, /* =========================================================================== * Test deflate() with large buffers and dynamic change of compression level */ -void test_large_deflate(Byte *compr, uLong comprLen, Byte *uncompr, +static void test_large_deflate(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) { z_stream c_stream; /* compression stream */ int err; @@ -290,7 +290,7 @@ void test_large_deflate(Byte *compr, uLong comprLen, Byte *uncompr, /* =========================================================================== * Test inflate() with large buffers */ -void test_large_inflate(Byte *compr, uLong comprLen, Byte *uncompr, +static void test_large_inflate(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) { int err; z_stream d_stream; /* decompression stream */ @@ -329,7 +329,7 @@ void test_large_inflate(Byte *compr, uLong comprLen, Byte *uncompr, /* =========================================================================== * Test deflate() with full flush */ -void test_flush(Byte *compr, uLong *comprLen) { +static void test_flush(Byte *compr, uLong *comprLen) { z_stream c_stream; /* compression stream */ int err; uInt len = (uInt)strlen(hello)+1; @@ -364,7 +364,8 @@ void test_flush(Byte *compr, uLong *comprLen) { /* =========================================================================== * Test inflateSync() */ -void test_sync(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) { +static void test_sync(Byte *compr, uLong 
comprLen, Byte *uncompr, + uLong uncomprLen) { int err; z_stream d_stream; /* decompression stream */ @@ -404,7 +405,7 @@ void test_sync(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) { /* =========================================================================== * Test deflate() with preset dictionary */ -void test_dict_deflate(Byte *compr, uLong comprLen) { +static void test_dict_deflate(Byte *compr, uLong comprLen) { z_stream c_stream; /* compression stream */ int err; @@ -438,7 +439,7 @@ void test_dict_deflate(Byte *compr, uLong comprLen) { /* =========================================================================== * Test inflate() with a preset dictionary */ -void test_dict_inflate(Byte *compr, uLong comprLen, Byte *uncompr, +static void test_dict_inflate(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) { int err; z_stream d_stream; /* decompression stream */ diff --git a/third-party/zlib/test/minigzip.c b/third-party/zlib/test/minigzip.c index 8a21ddfb..134e10e6 100644 --- a/third-party/zlib/test/minigzip.c +++ b/third-party/zlib/test/minigzip.c @@ -149,12 +149,12 @@ static void pwinerror (s) # include /* for unlink() */ #endif -void *myalloc(void *q, unsigned n, unsigned m) { +static void *myalloc(void *q, unsigned n, unsigned m) { (void)q; return calloc(n, m); } -void myfree(void *q, void *p) { +static void myfree(void *q, void *p) { (void)q; free(p); } @@ -167,7 +167,7 @@ typedef struct gzFile_s { z_stream strm; } *gzFile; -gzFile gz_open(const char *path, int fd, const char *mode) { +static gzFile gz_open(const char *path, int fd, const char *mode) { gzFile gz; int ret; @@ -201,15 +201,15 @@ gzFile gz_open(const char *path, int fd, const char *mode) { return gz; } -gzFile gzopen(const char *path, const char *mode) { +static gzFile gzopen(const char *path, const char *mode) { return gz_open(path, -1, mode); } -gzFile gzdopen(int fd, const char *mode) { +static gzFile gzdopen(int fd, const char *mode) { return gz_open(NULL, 
fd, mode); } -int gzwrite(gzFile gz, const void *buf, unsigned len) { +static int gzwrite(gzFile gz, const void *buf, unsigned len) { z_stream *strm; unsigned char out[BUFLEN]; @@ -227,7 +227,7 @@ int gzwrite(gzFile gz, const void *buf, unsigned len) { return len; } -int gzread(gzFile gz, void *buf, unsigned len) { +static int gzread(gzFile gz, void *buf, unsigned len) { int ret; unsigned got; unsigned char in[1]; @@ -258,7 +258,7 @@ int gzread(gzFile gz, void *buf, unsigned len) { return len - strm->avail_out; } -int gzclose(gzFile gz) { +static int gzclose(gzFile gz) { z_stream *strm; unsigned char out[BUFLEN]; @@ -283,7 +283,7 @@ int gzclose(gzFile gz) { return Z_OK; } -const char *gzerror(gzFile gz, int *err) { +static const char *gzerror(gzFile gz, int *err) { *err = gz->err; return gz->msg; } @@ -295,7 +295,7 @@ static char *prog; /* =========================================================================== * Display error message and exit */ -void error(const char *msg) { +static void error(const char *msg) { fprintf(stderr, "%s: %s\n", prog, msg); exit(1); } @@ -303,9 +303,9 @@ void error(const char *msg) { #ifdef USE_MMAP /* MMAP version, Miguel Albrecht */ /* Try compressing the input file at once using mmap. Return Z_OK if - * if success, Z_ERRNO otherwise. + * success, Z_ERRNO otherwise. */ -int gz_compress_mmap(FILE *in, gzFile out) { +static int gz_compress_mmap(FILE *in, gzFile out) { int len; int err; int ifd = fileno(in); @@ -338,7 +338,7 @@ int gz_compress_mmap(FILE *in, gzFile out) { * Compress input to output then close both files. */ -void gz_compress(FILE *in, gzFile out) { +static void gz_compress(FILE *in, gzFile out) { local char buf[BUFLEN]; int len; int err; @@ -366,7 +366,7 @@ void gz_compress(FILE *in, gzFile out) { /* =========================================================================== * Uncompress input to output then close both files. 
*/ -void gz_uncompress(gzFile in, FILE *out) { +static void gz_uncompress(gzFile in, FILE *out) { local char buf[BUFLEN]; int len; int err; @@ -390,7 +390,7 @@ void gz_uncompress(gzFile in, FILE *out) { * Compress the given file: create a corresponding .gz file and remove the * original. */ -void file_compress(char *file, char *mode) { +static void file_compress(char *file, char *mode) { local char outfile[MAX_NAME_LEN]; FILE *in; gzFile out; @@ -426,7 +426,7 @@ void file_compress(char *file, char *mode) { /* =========================================================================== * Uncompress the given file and remove the original. */ -void file_uncompress(char *file) { +static void file_uncompress(char *file) { local char buf[MAX_NAME_LEN]; char *infile, *outfile; FILE *out; diff --git a/third-party/zlib/treebuild.xml b/third-party/zlib/treebuild.xml index 1d1b0077..930b00be 100644 --- a/third-party/zlib/treebuild.xml +++ b/third-party/zlib/treebuild.xml @@ -1,6 +1,6 @@ - - + + zip compression library diff --git a/third-party/zlib/trees.c b/third-party/zlib/trees.c index 8dbdc40b..6a523ef3 100644 --- a/third-party/zlib/trees.c +++ b/third-party/zlib/trees.c @@ -1,5 +1,5 @@ /* trees.c -- output deflated data using Huffman coding - * Copyright (C) 1995-2021 Jean-loup Gailly + * Copyright (C) 1995-2024 Jean-loup Gailly * detect_data_type() function provided freely by Cosmin Truta, 2006 * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -899,14 +899,19 @@ local void compress_block(deflate_state *s, const ct_data *ltree, const ct_data *dtree) { unsigned dist; /* distance of matched string */ int lc; /* match length or unmatched char (if dist == 0) */ - unsigned sx = 0; /* running index in sym_buf */ + unsigned sx = 0; /* running index in symbol buffers */ unsigned code; /* the code to send */ int extra; /* number of extra bits to send */ if (s->sym_next != 0) do { +#ifdef LIT_MEM + dist = s->d_buf[sx]; + lc = s->l_buf[sx++]; +#else dist 
= s->sym_buf[sx++] & 0xff; dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8; lc = s->sym_buf[sx++]; +#endif if (dist == 0) { send_code(s, lc, ltree); /* send a literal byte */ Tracecv(isgraph(lc), (stderr," '%c' ", lc)); @@ -931,8 +936,12 @@ local void compress_block(deflate_state *s, const ct_data *ltree, } } /* literal or match pair ? */ - /* Check that the overlay between pending_buf and sym_buf is ok: */ + /* Check for no overlay of pending_buf on needed symbols */ +#ifdef LIT_MEM + Assert(s->pending < 2 * (s->lit_bufsize + sx), "pendingBuf overflow"); +#else Assert(s->pending < s->lit_bufsize + sx, "pendingBuf overflow"); +#endif } while (sx < s->sym_next); @@ -1082,9 +1091,14 @@ void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, charf *buf, * the current block must be flushed. */ int ZLIB_INTERNAL _tr_tally(deflate_state *s, unsigned dist, unsigned lc) { +#ifdef LIT_MEM + s->d_buf[s->sym_next] = (ush)dist; + s->l_buf[s->sym_next++] = (uch)lc; +#else s->sym_buf[s->sym_next++] = (uch)dist; s->sym_buf[s->sym_next++] = (uch)(dist >> 8); s->sym_buf[s->sym_next++] = (uch)lc; +#endif if (dist == 0) { /* lc is the unmatched char */ s->dyn_ltree[lc].Freq++; diff --git a/third-party/zlib/win32/DLL_FAQ.txt b/third-party/zlib/win32/DLL_FAQ.txt index 12c00901..d8cf5f31 100644 --- a/third-party/zlib/win32/DLL_FAQ.txt +++ b/third-party/zlib/win32/DLL_FAQ.txt @@ -3,7 +3,7 @@ This document describes the design, the rationale, and the usage -of the official DLL build of zlib, named ZLIB1.DLL. If you have +of the common DLL build of zlib, named ZLIB1.DLL. If you have general questions about zlib, you should see the file "FAQ" found in the zlib distribution, or at the following location: http://www.gzip.org/zlib/zlib_faq.html @@ -11,13 +11,9 @@ in the zlib distribution, or at the following location: 1. What is ZLIB1.DLL, and how can I get it? - - ZLIB1.DLL is the official build of zlib as a DLL. + - ZLIB1.DLL is the common build of zlib as a DLL. 
(Please remark the character '1' in the name.) - Pointers to a precompiled ZLIB1.DLL can be found in the zlib - web site at: - http://www.zlib.net/ - Applications that link to ZLIB1.DLL can rely on the following specification: @@ -379,18 +375,6 @@ in the zlib distribution, or at the following location: code. But you can make your own private DLL build, under a different file name, as suggested in the previous answer. - -17. I made my own ZLIB1.DLL build. Can I test it for compliance? - - - We prefer that you download the official DLL from the zlib - web site. If you need something peculiar from this DLL, you - can send your suggestion to the zlib mailing list. - - However, in case you do rebuild the DLL yourself, you can run - it with the test programs found in the DLL distribution. - Running these test programs is not a guarantee of compliance, - but a failure can imply a detected problem. - ** This document is written and maintained by diff --git a/third-party/zlib/win32/README-WIN32.txt b/third-party/zlib/win32/README-WIN32.txt index 384c988f..14e6398e 100644 --- a/third-party/zlib/win32/README-WIN32.txt +++ b/third-party/zlib/win32/README-WIN32.txt @@ -1,6 +1,6 @@ ZLIB DATA COMPRESSION LIBRARY -zlib 1.3.0 is a general purpose data compression library. All the code is +zlib 1.3.1 is a general purpose data compression library. All the code is thread safe. The data format used by the zlib library is described by RFCs (Request for Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt (zlib format), rfc1951.txt (deflate format) @@ -16,13 +16,13 @@ is http://zlib.net/ . Before reporting a problem, please check this site to verify that you have the latest version of zlib; otherwise get the latest version and check whether the problem still exists or not. -PLEASE read DLL_FAQ.txt, and the the zlib FAQ http://zlib.net/zlib_faq.html -before asking for help. +PLEASE read DLL_FAQ.txt, and the zlib FAQ http://zlib.net/zlib_faq.html before +asking for help. 
Manifest: -The package zlib-1.3.0-win32-x86.zip will contain the following files: +The package zlib-1.3.1-win32-x86.zip will contain the following files: README-WIN32.txt This document ChangeLog Changes since previous zlib packages diff --git a/third-party/zlib/zconf.h.cmakein b/third-party/zlib/zconf.h.cmakein index 310c4392..0abe3bc9 100644 --- a/third-party/zlib/zconf.h.cmakein +++ b/third-party/zlib/zconf.h.cmakein @@ -1,5 +1,5 @@ /* zconf.h -- configuration of the zlib compression library - * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -302,14 +302,6 @@ # endif #endif -#ifndef Z_ARG /* function prototypes for stdarg */ -# if defined(STDC) || defined(Z_HAVE_STDARG_H) -# define Z_ARG(args) args -# else -# define Z_ARG(args) () -# endif -#endif - /* The following definitions for FAR are needed only for MSDOS mixed * model programming (small or medium model with some far allocations). * This was tested only with MSC; for other MSDOS compilers you may have diff --git a/third-party/zlib/zconf.h.in b/third-party/zlib/zconf.h.in index fb76ffe3..62adc8d8 100644 --- a/third-party/zlib/zconf.h.in +++ b/third-party/zlib/zconf.h.in @@ -1,5 +1,5 @@ /* zconf.h -- configuration of the zlib compression library - * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -300,14 +300,6 @@ # endif #endif -#ifndef Z_ARG /* function prototypes for stdarg */ -# if defined(STDC) || defined(Z_HAVE_STDARG_H) -# define Z_ARG(args) args -# else -# define Z_ARG(args) () -# endif -#endif - /* The following definitions for FAR are needed only for MSDOS mixed * model programming (small or medium model with some far allocations). 
* This was tested only with MSC; for other MSDOS compilers you may have diff --git a/third-party/zlib/zconf.h.included b/third-party/zlib/zconf.h.included index fb76ffe3..62adc8d8 100644 --- a/third-party/zlib/zconf.h.included +++ b/third-party/zlib/zconf.h.included @@ -1,5 +1,5 @@ /* zconf.h -- configuration of the zlib compression library - * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -300,14 +300,6 @@ # endif #endif -#ifndef Z_ARG /* function prototypes for stdarg */ -# if defined(STDC) || defined(Z_HAVE_STDARG_H) -# define Z_ARG(args) args -# else -# define Z_ARG(args) () -# endif -#endif - /* The following definitions for FAR are needed only for MSDOS mixed * model programming (small or medium model with some far allocations). * This was tested only with MSC; for other MSDOS compilers you may have diff --git a/third-party/zlib/zlib.3 b/third-party/zlib/zlib.3 index 4dd28967..c716020e 100644 --- a/third-party/zlib/zlib.3 +++ b/third-party/zlib/zlib.3 @@ -1,4 +1,4 @@ -.TH ZLIB 3 "18 Aug 2023" +.TH ZLIB 3 "22 Jan 2024" .SH NAME zlib \- compression/decompression library .SH SYNOPSIS @@ -105,9 +105,9 @@ before asking for help. Send questions and/or comments to zlib@gzip.org, or (for the Windows DLL version) to Gilles Vollant (info@winimage.com). .SH AUTHORS AND LICENSE -Version 1.3 +Version 1.3.1 .LP -Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler +Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler .LP This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/third-party/zlib/zlib.3.pdf b/third-party/zlib/zlib.3.pdf index da12d37183a7371513c0991fd14a7d932fd42e84..b224532bdd8eed1939b7dd04a7a0dd46be649c00 100644 GIT binary patch literal 25523 zcmcG01yoh*_Af0dQi8My(rh{eq`SMDO?P*9cejL;N=i4<-6JNhxU_1vvpwOI{v%K4C&? 
zTYdpW5k3J4roIN5H!=V1M&m)EcJo!-^1J5*#LFS;hoarRpxBwx>4Mx zl=|m<$gTZa@#+CPp}W&0bE?7d^19}E+*ZOwFyPpjeWxtMnK7RITeV<>Br|IXMK)({WgquWOfxbG8iHLJ`Y2SUeovx1a5$&c!Z_V>gaYC3?o&9`6pVy~58Ml#nxevW>}MPN-U>HI>{&CS{Q zW@UZgY-J$uLaM%Y`u$2C=3ECkL8R78I;ogphO8yJCB7s&Z#{uN`s(O-v+|g|8w0Jc zF{R}+dQyID<8O}{q@Xy^9SobW7{aVzxoxNEq}eS<#6J=5O`GE+mnx^0uCevaR)5Es zfXOt#Ybt~sY$StH+^Yqzkk)J5D>sQ}qTE5{H3wh(YSURRh@WbC;}*&}ExB((HPl-^ z$l&75Wd9u_-i*zB^O=v3hgtf0oBY&JYQ$cu3(R;O5mGt;y1X~U};mY3|1G< zZ`>E`ta2!jFW*LjRm3h-YnHJak~|dWAfd{2pUOrCetR}csEBilSFF>(=?ZUoP3?NL_uj(iRxWE3iB8_ z6n=~MogF6}#YS@&=^wdgl|w7z^yT+evh}{C-TBmo@&v=&-KB7d*QC~1=t4FmNE9C` z8w`jqWm~x$zDX|G_NV}61#fBgL7M*1Tw$ej(@codNPjtmR0}I;*I70w`js=HwllV^ zf-vXk8&Q~^?}n%$-42_%lSOf}q`KO6@D3`DYF{ z>GDj)9960K1+YLtzye!!)ciDQER-W~oXY4z7-gCN)0SLvwiP+*JJarjokCl{TeAM% zYW!?rpp;_ajzI!g)iOHevF*UAKr=Er44-5_3TY)5&q%A*Nk?!Gq{{;c?oym zpiSY=Mm*di-pR3C_T_bfsO$pUGp7&@aP%*qCaIK)&j=&pZhBq-qUMS|DqMTbFnU^PXc%Q)kh4O<23t%lOk8R3k@zj z7S`95ws@aj9EwIRuT`GWQJe2Hsh5djI}^!&7IlAdI-K1syqe?E(MT)-aq_XJi=ke% zBKet0J+MoV(^FntD)n|#zOx@e*!fD5iL(NM8HZ5NP>>F_Z{2?E)O7`1!5khr3J&FB zOopZgHfcX-$ji434m-gk*O`(+b%Kqedi2|b=y+Ff&Y>;=t#HOLXZ9j17b>?8sh(4$ zb!p2xT zbw_rhBvI%y>N(?OWwIz*Yvy{Y>!TQ8OBmrGH;T*OSrH9M0kxr8E)|X0$Ue)R(D~@t zM^`^zkb7}+czf&TMZ`ad)ag_%9ZMe9-UAt)goYnJ(9eImBA=8)_xhe$yErG>S*V;09&cdwgmzgc$7B*hDRUM2(hxEd z;|)cyE<0ph{-~H#&V--4SFxCA&vn*rQO0L8jqKH$!F5IjvaS(pAsL$lxdDBJ*G!ql z;>WSLgN;!!bkTcK!uzacwpgU7GKSyi5+T#Bb9ToIO~~-lnKMJ>;}yk)Lw2L=$O_Wp zz6E1Z<0u3%T)gs+eW%qJn55MBnbRxEQRv!cg}XpiBzTe^Jb=KGLCrbiEvxQ!H^jAA zzFqm5jL`7$`qf$GjFXO;o~zgMlEGJ=o`uTb-jM;2tSOU_fVE{Z_Vk@*O{s45)>A=? 
zX_ zjNh7w=ApCWU_Mek_;E}t@%jme!)-3EwZ*qXh z>8K4j0YYD_2zViU4B-~i?3;@c(IUI<<6zUwr}jJ>YigRCl`@Ft+w%Fm7%Qh&+5JFH=8+^T8NANpj$bvus1 zOHUn5;JiKGM*?L|x_4Rr$+g#Ou%fMdX;3Kj2M@n|c|9#BMM{FAmuQ8Jk_H!6d})A- z*R_r=T?3CN$@jB&xK%vbpI*PkB)Il5C}gQ|K7)XN5cJp|Niuc}D2(c@^ZyYjZ(aED zXe_nGjK(d?Vbrj&R{<;Hd4ZfM(a41T%rUQnsiq!PmH=OuW2Z5%L-Kpz8hc&IlR6Psou+b zHLMN0-Hq)!vssNAZCTrNrQmY0HZ5l~UAD8ccAU7%qPsbr9X9)QOc6coy*C7H*4?*F zcneQ`uoD#{a@iqSlq6P8yu7w8c^#iT5lW;iaUZ=&PvL|2&zD`ex(-f9`7FM_c8~AH zKbS)ZIq;|4Bp@-MDVC5Cp`?i17i=UCoLATMtVzCNC2Wh*X8=^UBGrVfa5Xf*YG!b)XlM~Fo-ZGdfoJ!1aWk<*DEAAha_ zW1?u%(`;q=W5a>k`SCWq3Q1|e3jxU4%o^eMt)j&M@8#(%DgQ)lkzV}x48_A-z7#TE zGkoNc03Y|z0lEj;Fz;5g{o3y6H{+p+p%q+*XPyh?E}K9L{a-3IDD|j@KPumkPkL5X zHuzs!IJF zjoq+s0rkbB!xjJJi-Hlvmy zYRvHDNo4dYegYE1&-BMXBxpyr*P0Tq5mCHBIHfhYHg38e%}` z54F7F947ttE>jcox&@{aqK9=RYa?wGEH+US zm?4k)$S^)d0hpDNR~MwM z%mke2&y!k2XJhZhW&GwpS>sV@6a==8f!|^U#gc(3PT|W@^1_=UPGg*OiRc|%El&N) zp$O|)L4y-!;AyFF><-UM)X0^Jv>Y7I}=fG zNs12rQP`MR7AuEZdczD_13Pm)0w%E@?&J46gJ8&F6_HhYnz)yPpX9IVcxTatMUb+o zEXy-8aNueCcmW}o-9`BT+u~Lzn|RVnZmAZ+(iW1Ht9Etbm(zsQE43CL-wS_Nal{8%h{p@*ufFd#SO zS9MJ2rkmoMlrg=Y9t#P?Es;BEjK-)z$8&Qv$}js$`W7xvDt0wX$yOynEi8o!PpqkxIbwE<+k3x_SI9^tppv-0oE64Pvi$XxlJs z1R9KIAl_b-fmb@hs_3b@06>RXM?Kx$*PNFQp4T*x?R zDicQBf!&^DoWfCTmtn&2*3^e4oFlV=9-D_Pv5M?iX=tv!MU#})J2Oz*ebeEkW_Lw~ zxBM1MsO0%VCDQoSt1X;jP%1!w;CUjRoMBFk|eLUxMe^ zXJMK@_JvG!27FF6R&s90Jn=SP?Ttj$1*fNW1LK0l_g#ix^kVWJDH3-P3VJh8@Hw&K z`_T!3T|)4i{~?>6gwCUIqGP6|{Vkjr|CUYi&Q?Hxw2mPVAOX}j*5S2uB2;^nQ_OVq zge+|I8t?#FOFNJ^iI5i5$_8j*N67Ff#X!YCTT6QzJ)kYf2_RK}EksB`c`lmpkkK^K$6ve$V~)a-Jmh zpX+(t;LqiN_5y;3@ChGVLV%dDzAd2|{3Cj@@N|!>mW9_K1dU`5+9~~?JNj#cUjy*z z*y)&A8vY*X7laZ(I~{#c4I%x{vHpPM5w>5D$g;pcfe4VLV}hr90uPjAfTw#J1(c+N zr+eBKK$Z>uQR|TXdz`2J{y!M!kLB>{*a9Dq79g$2ry$5fDGW4o0NNSr=}=4Qni*T$ z1D_6_A84y*V{Bz-X+!w5w)RRGs)33h z=llDQp5<{2T0$nKM@ofQ%h~JNJ<|H) ziPHUz&=a7)5c+Ft0%E+10y32H#^yj{T z|JoNl)8lF%@1Pz1$36VLo8NoH6L3cW+Sn{|(N6a~0q(oF!z0B_t$%;Vf%uu4C~(V18n@|4cm$ z^nbw2!tfhr_@6NU4FdxM1L(#6^Li|Otbfe^K8W9N{QL7Ka6j*lAp8vkEj$~DvmPOP 
zJf6qvDGg%7$D?^nKVDCBfK^dCeV^WQ-Fvj1n2Wo7%X6#EOeM{Ixo{R4!bsO%Rd{!Y_Ao&O^ZJTmj+SdSF< zo5Dcl@Q;)SYV~;fk97E}|Ib-LrGK2wKYgM8VfX&U=L&LCV#*4Ci;DjN+2e%&05Upy zy1y|4?dlggzn)KY{lqbUl|OOXZ%Y0B{HylQ{{MpTPgEY;J@N$T4#I}@=g9xk?mvtK zs{cLD&%VFg(EsA;pU+=yo<{r4zfb)5lcRpl{C|P|BPM@x$fIcbTNXdzE~#S<()j*& z!kZVQ1wJW(EFj&8g`Q30F`x*FgEE6w%?c0TvDJIj13?uarB26682HFaKeLZhJ=W2} z1AdB49t%S=knRW%kh24tD-zPtvcLl*jBRZ}68cHnKS@HsqcQ-}AISi`01<#VK$ZMy zp^q_9&|CV6o&ILZr~I!i{)~728SVs4^rxKX`yC%;WTbun{;xo&p1!t@uC~5D`myi; zxfpeixa9bmRBsOzl>~IK$!(Ou?d$eazJ%KP8^hxv^+^I74#WkH{sj)I6AyV6-lN>Ihp89-xeYi(s%%w0CweuWTNKykILy6)OX$6c182 znb<%Npj%>q@~E$0f`VERRO>@rgJUD0uMu*k~tYRgzYWOlr<}B3|30BDBE{Nu8KQ9t2F?*?&!BDACJ;oaq; z`$bnzeHfRR=X%J!O1C1g=bA5cZ|I(@i*i846IVOcHt{SHb1o8+E284qU2B5#pMiT#Yn&cguEz31D-D)+@g<}UYgJ?t`p3}Jfn!`oCx$ zRQe~aGcz;81Af!`W7q#m?o9tq?o1$FQ2xa-jEtZe{-S*ltpaoax&S?Zo~60D4gd%+ z1Q-E~0cHSmfCa!3UZ#RM?&sJyn$0Cw4^AH}Sv#H&@Ascy3??(23^)!t zH($7CUN~p)@vQs5Mu!LL$kHTItGi}xB9gM`S*%jKu|aF01Uw{Ti{%cz9oWxWE`f$s z=7vUCzfx>)vx93mhPR{gS?TQNfL}|mS&c7_#A7=$5bsY}fWFn`v2c)-IsNvP#VkS+ z5OIGHyUA&$W5En?!Ki&v!3OO{ld6QC*(ZSs_aPG9YUnV>lj}ZG?E&HZ!C&ZwKKW;D z7A-=mcW)%Pl3wc>Zv3#RF`XRrEdsSu(r&n7zr-y)oStud!#!JYlYn0XAzmD zZinVOwkHSUG~6)7*E(fj_n7q;d0k5C(L4?vMH5Y+>zaAv#;ue^MYrELxSNL8C@Z8d zgFI_Efo`t1xP6@waK(ZMw;{j+pY48g-B3bLuOv0oallLD^dR_ynlc652huoueZ%|h z$dxZ8t7bgd+;XgV?T#tUL$(b9mWJ$YiZqM9|+MUcD&!;`ZFF*29#H@#B zAP`<51tY$lL1CG^{P^8x%Tcc2bh*i-ZE$#{R(mVOk9PgA1F(~1G0Em(`4 z@~%NbykrU5h&Gc-bzb!gT<{wwwbAG4BEst#U`F1#MW60C)Mq)onHI5euoz?6=)4I$ z7o@xq5;gtV-4@*C_TGe3zn6wddz+ZAu4r)zSy@|`U!{UgMz)f8h){M~8OHA%K*YX{ z{Uc?sK2Eid21D+p=7!yDFSzQ)2s$NL)IobSN+TnE)2J41%hb5oofzTTPE!vM6d$e$ zdHtGsq-e_`b4&$}f+xI3>N$*%mKN}wI`dWYduN&XSo(mnn>A`eziQ~)jnB-6cU3+L z5|Qf5fmjkeQxu5#s9BJz_2U8_Dm2p5z&^+P+`S>>T#a`D8XQ}F;2fuXG^A%_UvFI5 zC?TppK3h6bgvV{sdo{O}UBJvv2rww@)S*o(eFsT-GM*iKrRptd31*@V{WA4cBj0xk zKPuJYo)SHi<5S!$o=$LQJuS7(9;Afw-BX@g_2)^de92ufUIW3LUKhp;r)*izV(uqwejIUVl+qZ^&xBI==||JW$5Qf(Q3CE9nJsBOkI@6ASG=1{M85EL?%I?3qqrF 
z?~DZY$G2G6!+OYEco!{hop3T`q1Oyptg5&tcOpqQf2EKnEk?~Z016#+j<<)XnQ#%% zhH^LUM@?&kw-{L`@N1NiRVaAbX36hJ7!kb`Feh0LtsT{6(#H1smQAjZV2rEkC5lcG z)28Yy)~9lDNjN#JI5p2XFHRM7oOT#j>4QKxO?!UGQa8RD`i#tJoRb1ERpT4h*P4EA zsLGdBP2IX*HcNlt36wbLWGAo(@v0-?21c-q=nc zef&7AzLKn;7$U5%wPE#{qyQ;yTluuHt}H4mN&MOB0GE173ysv7W(}_>ga?w{(!;6T zA&Y^Q-U}YKyMWeWNDgV;gw4y29Lyt`&R~~z86=7>vG%T|_aerF79Ji&LCJ5#Sj-@d zO0qbV_Yvb@Xlr1lK4Qa9MzAN_651^(dyTpgm^9_I6%1jC1TqK|fV&W1j4E9Wa^b#e zPf4jasRL6zm?#W;`9UOfEB&GvmBQK7pn9mEUyU;9AMmi`~_L!z(GsM+RGp_x{ zh1RRV@YyB3GZEzu+N56Hdl&V-r!2&@Tlt1CURrwYq(0&G3aYf%W09~xVg_oswGU6j z9{#g(j>n}G6UAC3%;!VRVOMTRIiOfsjfY4?fnb) zhjk&Yb`k~xPscZTq;I=LOQ}j=6pR$yhxNWf7W=MM%d1j?V_)^Hk~$W+8)Rlb$H^6X zPnsVjgox7v7lSbgZ(DUVHUIqmt9@nNZ?1E4G;Wj9+r^)N&Zc#0A|s)0FiNX^uhOg9 z;>;9M7%|cEZ24=!fT6s1kwVRPxHlg%9XwZ)GGd|5(=zv!3O!E7jzfC4rQDUu@=2@E zmT+e~E&Idf(1c1mi_$7*6%PRyB%XZgXQ3uY%6Rwbbe(iOsY9AjFH zD%3B%lX9IRRcfw1+j1UN;H1dwDPim>BPQZ=PC=)|U>KF#j17btV)0fXh&z-{5XiYj%Rw07|*!MgeKFlpl0KV|&)iJu;K-@b9i(PVSthh~)$4 zdM0JYee(c`OboEEBS7nIZ~TvbJGMbe!Y}yQ#ckF^?0h_+ElTQ!J?u~KyW01_&f<*M zL}`v>VUDo#SCi3u$2M_gEpGw?ERg518^mSHMoUSyKGBJO;pffoVeIn?;1Ah}e|7`- zx={PgCVqq{t6hK=>*Cr~s0DVG#HeRAnAm?*R-!aaAtANl%vnevr~2)y37eLWz>BStlM#s z8}IYB5_c#uskoyaz+gw6TK=S-_|au&7T$k9Vqvo4PFM2k^HX)@1q6qm(tGzGI{v``~U%fAV3Hp3^IR30b(FS zKms5MkOs&AK)x`5JU{`U2v7njgUkaRJCKE-YonuQ3bZo=8h~;?(@*ZKC)2^q(&Co^ zp${@444&Lu#txvA8NdJ(uz4~njGV2EfEFOz!USOYWM4dazCc!ng|P(?WNKJ~?jUpH zG2Zhu$y4g*6u)c`>p#3`KMfL~xv`$*kWeQ|7(8Ye4hk1`qEfpB+OY+C-ZhcBC&o;qx{_EW0XUOjUVEYWg|6`e zm6sGMFbzG%D&~@y#q@?M^P-WY-M#i;@uZR9pu%%{)^(_HGEFanZXGogiz`CAOgZ*k z=v%;Bz)?^nQbfhW!|T-!-SsfN_42RTHJ+dFte0thM4*y7U}C>14WJ~j(Y_yBzAB_O z$EM5ZRIQjRxO4{~XCl9(R&8K<|N0F%JqHnRv3HG9aR8ww>0-T?BXRgg_hF4Q!Xbfi zHr)|iS++H~>l-~j+1QQnM#l^xBPSDat5MIp-Ool3o=0##mu9wU2ML#@a5TYlG1a68 z!ZXQrlzSDYOA9$Kro!4?=_7ZMkZchgBM*a4!)6KgFjnz)R&RUi{13i*ej(@ns_bdi zGui^|Q!2SNn%H(Wgcw0Tu){(3S((qYNC~k63IR$cN^#Iahe=!fWzsZX$?J1e z)8H9DG`hg7U^sqAx@f0VWvLF-*kD#&NIL%{Q8S`^lrgwxZP7_QR>8?op-U+B*3iL6 
z{A5Q_7+iza>Pt2CX2|EYN0i@KtiIXmvU$ePAHS`NE-DDv+^e*1^MO+$t&?&)nymgB z7Mm{>l6Go$+~2d_<0*NRGRknaQ0OTxCxMerlX73nSKj^|FCFi5zg0#w{pwpKj@Yzh zjtcWhKNU5DwLyyHGmOu%7h#$G2kCoE?5@>7hlgFticEHMJ5=_C^i`Guqy6#P=e8`| z?Qzkx(s1rKNna^d-YhcjzxK@`TLB+sU~%3een(F+7yh!l101^v_ImE4*Xc^WE?;38 zKPxP$RaX42ntYnJD3t7b;oknE$x$L@;c$aiBAQ(7H@Pxz4B=y+QM}rBo=oH$1_o)+ zp$RIBaQW6i*^Pcle`j4%SA#T6YO3b|CqtHZCI0q&=mW`NQ2a4VZ)g>%aT3oRmXpp& zYf}FLJCqc8-l7mX`9<_019@y~Nd;Kl2-OiL5ni5stl`z#h#ouktH7_T?~Y$%0zElJ=C;+O@q0eJyup?A z;yB?h$N;utrIs%dGrKm3qtbRP`pfJhqt5NCBaK7oyRoGzSBM6!EwGbF&6h9`u728X zH8ZsFv4TaT9p!dWNW?I~BhB-l55cTpO!dxpzgLsmz2aUqLWjCCdXDIoru8gS14Uon z%zEZk{mVlQZ$qH;#eFQ@cW~B>fO<_{?7?QIw|l~$vzTVFEpPgpT+k8pFiQL5D4!z- zYHx?CM34xg_xW$b+r&yL<~H)QW5j(RE2}Z#-9x{wa-;?}4vS%cp@1I+$?AheC5z2rK|ccLYuEgOv=YjPFNmy7oD>UV=tkgo_BRy>SaGiM%JBlrtxS_bw_$J+KT37}(s^NzNfi+3z zCK|wemU0sK7RB_vmsX3X_R`6<{e?0Q+jpdbE4@`EwQj9;&zw z;oDG3_u=`0YhcpAEiBjCw^(pX0#V)rlP#!Y4PV7)E=I6RvL?cEM3mO~rdh=PaqdL% zr6a>eV8PU3ke%}KdK^%b}?)KIR9>tt!DWxi-rL~noTRHhH zm`U7|iskr7W#J6jqoVucp<+yl#z~@8_?#2U+KQA^5XHI58W5YRrp|h5sz7}!w{1nP zW#l;3gJf))v;LMfs!lT*#LagVtkOYdUY~uC0)9#w9;}AR58}vn+-pg%=C31M0 z^}z6bhzPK~ClFf{H0Dz5W+}2hsGj*44r5$+{vK&{$!(j`Jh}1`?Utx1M^&+>Hgk!&lcC$+iG#y3I}C23ccXTPHpr=bwcP~4exNeRl8K<-@oH_A}@N=$nmP< z5c0#S`&S`5ro}7AC*$xd z!}MAdK#&=JhLlQE17~UOi2>i`Z3&h(IWou7&MoRWA?rq$nE|1OZkQxeT2hZy=O4!6 z2%g(R6wX6!rG5W5__nHM8N}s?u%Ne7l{|%%aCmel#I>;9jm&Geke9-Rt0}UDG+u&&CUyV>WehTri&XX*?#=pE3%}aw{}di=4H!YVUPKj%*<{zmub!ooP{& z%e>W!HPp(ij8W0*(M7ekiI(QAURS~3hC;^{__X3K-=B&+P&C>gtG(_EA6nSXtrq>+ zZ+kKc-S?nXTQ};4)Gea;U+#&$@+*B%8>M-DP|EbVkgD0!Ge)3`_kd>h;KNE zp2sdyN(K846}DckN#@mjND+Y@7JrE`j{Z%;dSn%~z1Y6y5Vz?8k%gw>;w6h5g`5~z zdJYxd3tj^FG#M5V6FGKYGYc=x@|o{Uoe7$1Y6>dppP5V~c*9lHV8!kDOzjVP0$2g9 zn()*!snK}oDJeAJP%<@v8qXj;KR;DSucfe}I$~DLyU;Xz{`h_c%CWO!Q_Sa*= zk4Hp86PkHtjh1+jHyt(%RF?7Hd=JzOMfKr2rPlB8O{cb;rOIk1}E zQp3>wJ=EC4w*WE0_iIIGuSflMy{J6Yp9_g&Y$9g87;aqO)m}Dwg~5P+hIkyX%z8nba7K4Smp;0g0sw)8) zYFkyrk$1>9?nkDaMKM5K#aG{EvOTqW>L*-yJ#XWz^OiU|K`AY33>zcd1E|9d(LwIY 
zphqlXOhj%lVn#*vDnHDPE-O|bC--AQ?mfHj#`5h6ubJV?#7fl=6C^kG@dLWFw~7mT z%qP#)2a%>qz+DUQ|&Khfxm7KiLulqhgaz5-fj6`-OhvPkJ)R&YiFFk=~S!W}n0w1c|oN*Tk zDL(LEvkwGT#Qpm0L1k0}7WzyLURz0%z{DCARgCUV^F;Gk==3aK3RyuvTRSLs?chyE zNdjL0D*C~W*ick5wd%5HjmvTvHuxHFC_~R+)=@@&MR1h6a<*E2^<34?A0bXOnKvUj z<6{uyDfVZ;G9&B#4_~1TN!-JiE1C&P26M4%+ESB3kQKNBZ{DBV_3-~l+>Q}%pJTj^ zz+R&#dOk}=bi-pTc>wNWH`F{Zi`y%y$ZEMJj%0k55qM-Pg}vo|`%lOLg{o%uLQ}XygPHzlbAw zN!eSbRQ&Pd7GlZwjqO+~GMOYNGE(*W2#1J!koL@!if)5>gQcvSmxg46;3V8WX9a&Y zTdHu@CZ|!N6_`D3QAW4G*-dA-vpAeg@A~1GiDFVOxpHjAP`)u=T@5+Ir$XSaN=cRd zvya-+$4d6>t;3e0ZzMJ9Ytx%GRygAMFq?LJpx0{M)oTikqOk(lZ>q;c4VuqBxP*n5 ze5;qft;voa;HTrT7=ocd5=Pp}ALrH7m1#f`n;RXV=~Lg~vlN0uiya#U9yh3as1}Jv z2vAKBaiug(kkY~XjVwk&dU{;u-fzC*aTIf=+=Q8Weri4_G!K!H_s`$ngY=)5|$neWW6zP4k?;@W!0VBKCBK0Pmg~S8V)uqQ`y1kVRtvWPS z78Lr{CIqTGi%swxGK>Ho-3EjzCU^(6ZED4Kp&7!}|=2HZfjPsk(-a;y!+fsSgva;DpV!K{A^0h=7U}D{E3#g1W5I zV+i#dH5*ji?f!t*9Iz^sV2dUzyRup&hm%@doSGtAw0g+uq&S!hm4hRXzIUOB-2%_KT!f7?>!rD2Q$?b>LF0i+hXF z41qhm11jGbqxcVw*=vBE87EdDnS}mkf+%Swqazdflr&ZERJff#Nd&(Usk^apNh7@b zTF~rS^#HT7fYxM7=LD8kFhsCd=dk;ctbuPBRU<`tJ)rAtgFX7xbB(U*hxX}nEndJn zp_2J@EN;r0@rL||!J@CTUp4dwJF3z}y~CYFj_u-kQ{Hrz=hFfef#bph%0065^l~3u zWe1T@6DFBp+0}{-BkS7UO2FzHwWkD~>F_%myE6uaGPl%`q^Q(n~jI7KF(=b~O^TAY$46WT=~e!7=&_e2ZsbkgaH9Lr+!rmB~SjBiF>b?yJqRMau{}YNm18Jz?O}$^UenQ zY@;J**1-b`_3z#zJ}|Z5AC(vT^Ygz~R2w@x82K0g!7iVjTm}bTug46Ixulh9#qmL5 z2Kv+3;r(>bNuGPh7P^eocSJ50^2)1ZhIud*WcK}2Sd++dr2<*|Y!ayJk{iGaNs$d#@>Se_kJ?bKo=W4S>=Z=%C z-h(}(r)BUZh$B-i8|;X?L_Df9$Nz&FzA996q)U_>Qi+?iiiyPpj<*#lyI?0V&+oR324)GVbfM3PKL^WfUM%XDUb6QeG+$j-O%d906g?z%$LdH z_DpwccwYD)BU-{|2(yKHm(*fHUX(T z-R@GvBQcnpON_9&Vp9l=3z*9|+|)4}E^>@}FY3J|r?lzwh8_~ggDyBZqQ*wCS2Z8l zRE201oZCxQ?Je;w7M!v!`GH`>q`Val#Ymq}fYMk{~Hu!^HH!1V=CV6%J~b5hMt z>gj2RAqhDg95C3o48@4nSY-ol!z;J%4iS7J)OS4y&pV9WkrDWkbo7uOciHE3-ptv7 zSM(zz?l2A{e%n`?1dYNX%mk%Ll@zeMfwBps#usmdBEFaUnoTlv8?>{D6S6RqlfU~! 
zQ&^bSR9#C__Fge)`10ZCm4~zf`8}S=>p5^P@}1FwM%riSlf9RsL%}d(Pv^p#&emhBrN{4m%EUj9!nGb_L>@L#} zt!Ar>7VHifT^txAwq!(XB=mKvgnj>cu!!pmY`4!(G&gF+6&g~Q5Ob600 zXqF+={o%eUizA3HEn-jXoSMz7n6C||3ok;uGbS&F4lD?GgW9f+Ha$4!C*>RcSd)cbt5^_S3dbW!1 zr?0UxuGnPW82edHN|u=5jOepsSh&wzjdhB(#gbYwaey&`K1j(a-1LS16zV`?prR&T z2azu0gqFfOCn8{ElG%X@e#t|~i;J?g8ZH84*sB??>U#LmaulGq2ho+?;`6BnO|sc7 zTx`I9a8rX7a*xM*Q?C;7D(lV)!-W99*0o%JDgK3fFRLvM(nQlFwS<$4 zx`~gRvgpo?Ek?m?y?=NqUVN3qjGK4DRCv;kvZrfDfT3W%?T++iGfn$p-GqFuq)BJR zcsGUUlu23TmS*M-o#ofoH})l30ZUD<9S|x~T%U0gG1a4&->J?Iz}2iBEqVFFdz|^W zKqF$OW1joOIn+}aMqYDSO2-*ljt+}@>uOKT*cm$22DJBXHoRyNqP!L{SMrD~E6HvC zdWB^Kg;8DytHv9=Ft4c6XL5aq&1_~Tb&8DnEugzyerIz}`2o=Zi@jwHtA%HX&g%hZ z8Y)6z#1ou#vl;U8toloR#vS+A(&5|-yLR~NlK88n8@4(D6J|1uoBP;-uuoo@W}Kt*a8{iosQ~h)G;4z)5f`VqOgp+AQLWYe25Et zm$Yqeqo)Zwqvs=yFj4PW(Q2b-8D_tm3_b_*U?LkV6wboaTQMh@VUqN3$zphUn@G_r zMNl?3LS=!9NKhNr7ON?}H!B-VlXgEvA&NttC}h0H!gsJ!g8vbC(L(fza$K`(lSMm3 zd!cHq{Z#9Llzp0-?EpQrM}T9$P$r(hBTGP)ji?1_EY#JCG?@OrD&01)lE8B!l6Abo z#4~_|t0EgtYaZFBQ)hWUBRdFtXV}_2=>7f-hSR$%R92+xK6ZAR^n(CKuUKQO2Hd7_ z1Gr<yh~CJh&Pd~)&M^}8f|@S4@=_xhhPSs*!X!cdg@uU^ZE77 zk-;_GlI=-Vw2T+U{V_(b@zt0e`b!o5+ZGc zYw+vbZpt@MoZ24FV!ICZFGM(T-v&j*@f+637dkh39vC$uY<>nfm8f`H&7{QhR^rv# z>i3(~1EL-1ziU~0X>lMm2Bwp!LuA?le6b}Y&d=g3q`suzPupIsU)sr;CxJyC=8NxU z6OUE~-H+EtW=vkBzra5zu7|SlWmsvwOsF)Gv5rEjA7Jm=sT&-)?>)PfZEoq`wBj*q zASO4@WJ;X6SyJAy=l+fjJBvpXEG&xtZAFYdM$6Tu<`CwDJVQ5!r2pG)We4gS1E0i_`pdS@Z7e+U1)?Yfg9#hDa_O&J?N3D z!gX}+E-|1;wr@{l#&=UzcU7ldr8mu!;d^@#k5#RmuqbC{ddo(zFFi5 zii{#rQw?p^n^V?it+p$pUPU(jm?zfh=w?qNAC>)94~tJhQcwEcvv`iycy#PmBo=PP z6EbnvIPyP)i61{B{wHDL|NMRbQ?Gme2UbQs7O{B36AA1ljG!M<{%<6jAW3TA|;FO^Xo15-0fyRR2H) zSAj&7ajHQb&*9bxvCW^TQ=k^!OB)QUDwt$j#VxHqX>>a2YS1=(=Q)^sa_`jVesJUd z-Lf_k*wBcT0CZfcWQfshxBZHH} zVi6L&&_EKTlh4qspFq!$l-y7CHYvHiVZe{JS$CZQA;~exG1lkH6u>`%uC}Hg?XXo7Zoia8BXAkw#r)(B!ItpJ%k5A|;Ufsxk6VE3xdioS{ zJDQ+13!HhCyb?aVyNG(1J6%~p^}kxX@_4A)_TM68SGG`1wh?AIGh-QM>|6Goio%R# zvNi^jHA)EC)gwZRvXhW4p+}_bNhM1{S*u63=yyiTQ|f)6=lA)%=dUyOxt9A}_jS)) 
z=X|d3cZ?V$_PG^2@({ycTd-XmQ7T_xP*$7>C>D(wOAPQjoO-?cL`7ZPa-FvmsfGqjh}hM;r^8CN1nF{@kjtifKFZofpLsour|)K8SR+A`NG(0jg`Ss8 z@3Ea|7(nFUGCl)=hAPExsygx%Vk!!s7HXxqX?m&eILE(a9h8=c4+XZkh3m0l^2I@1MwX?8j zU-slq)~&s~(S6<)m#@P+d(Hb)kIau$oV&$Tui4h)L3u!W)1}aDhY#dXb`pNrmeB3+ zdbGgsRQa?mDbayNjVsius_RMLz;ppKm+=Cxb@4(O?*Wt1)SVy3F3S+dXF6TZqb176 z0(bT0?s;M@6KHV7PP=;Rv0?x9G!^^}J|o{;tfl?Fc5E7OyP*7Z5~rzKkg49{h4FN{ z9vcc9Yt~N8PRFi4;#ekq_G@2>VIP3jdX^d5a$;1fn%Bi%j6j$jPj5?r#7k%x$LOp? zUZ^=XKq?pr8@#TaoVY)KOVm8{)Wz)AXX17D3hgqfLVEf7eR^Cs#rV?A zeXoh18ScR8#J~dyPiIZqH1t0XRK&QH^_kV?ZJRo)lpUmsfun(Nz-@F;#+LUwtlRAXUwdv*A{sdT^Q3;vScsMI>wD_u9-U-RAdlXESY%` z&l-hXW>iF+@2X%;P;3{k7-8c*puV>PYZen0txXWPAXag!q>rEDXw<5@KdpXge3 zW~auodB2c9eQL!j+}%n4mr>DI^0Z!(H|*5V2(I^FAVY}94X)d+p5RqOCsi~TFI z%O~jx)ay&{yfE;1-M6;E^O!02Ff!}K(psb>3=N1rDM=f4j^$G2DI2V;RGc?kITr37 ziFy)kS>}#TR*zt1B0QLLXwH`jl7k9{Ehk(&P>U-!1_9bUSL0`{3@VyP1|18Zg!}Rg ziV_cz>MjV}oJMk%Ch;83eso_FxBHJ%<>dx*B`=#)=|klgh2LG`k z?&!_xB5Q4pZ1`)p`d!|}YZ&tutakW2RTdwYIPV_CTXJ2&FM6TXZ!+u!!z!&^#xP;@ zJp*PiF_KI=%uyW|rG=W#x`(_tYjFfUHtc=IAhQ(%q?U1+mT$E(QhDOtFrV6OxFues zA~sNBg>bMSDFUW6cIS1>ssL;esV;o!$g6_A(@Srght1!M(&OfMY;rOQR|;F)%L^N4 zor#0CZ9_KZ`Z_RC9u|uc-BXrYx^u_<%#$J|EGA~n$G9Dnv^YMUVE;faYlyU+I$$TazRoWc5BtIJ@CG`iP6xpMMGyhKea&v;1L+2rJ+`K39|SasNC zm@wby1&H?5%4=~AclwzP$F9{^%qRy2n)>fGocGOL)I08>_gKnkoag$*@FS%eo`=Uy zw(YEI3N{kCx-CHLVG5(4ie&h-Yq?VH)fbSQf+gj9DpEB=oEe8RaLEQo%R<|D*t9|n zqu<|Z_t;hK)%v8NPr*!5W@Z?X$BoU^uhRhEXeAJ=0}a*G6iARqec=xgJn8WLB7C74 zid#OH4icseB;HJSU0++q-Ql3ly!hj}+QeT6bGaUu`@|6Te&qXXe&Q|uCTa9_)Ct{G z*kWmCjIDj2X8^;(y%ssM62_Z32<@V(i<=H>uabI|iKNH1q0v8C1u)8X0^%FgLDqTb@kOB5&sWo5BwZ zJ2VpK=*}(kzUjf*{iZCU!PEXtS@c^7-?yrW);T9DL&LCQzm-KlfM@<2P>j}e_n(R) z0PMc|xhMhwi{HUB(zMRH?|pQdVBZ|AD*6JT*=Tg5A!lf~nT=+)-?g%B;Ano+pa%A% zZSF?f0DJ#XO3^-ljoSaCl#)fs{#Z(xeSy`qB&K=k0Lr7luxP6?FQ+&;-$=6HwTUYo z*I9`OcaIV>2{F%FEDdgCy7G(ADnl0%Gz5q z*BW5w%RV{1J`_AO6L@#x)XRy$L#pMWkVD<>{+h>5o`X-=zU_XTcwG1Wl*Rm%_}SY} zbn-RZiz}gV_^@e>n5j$72VZ!GX0v%V#Aj-w#0z?bfuIge1Y&OfQ`1UlT7heN+fYBk 
zC^^TnuGUzC$TdzRna?&}l@msME)i49k9yQ~jhj2qC-(&NXWpgH*M%4x4_`h$Z;(}_ zou0~5sKl3;I8oZw-=PFC(%?{tv}p!u72}mfcFl1fQ|IF}s5KDT&3TqROUypPl2M^C zGHw^jp8FoB$cT`$YQ)*8Tk42+XQQE<%eFgijrMReC++3dvdId{R4zB+Iv_8`&#@+D zz_A$hStGGiXrv}SK6Zbbj1GgADs<%2S!*A!xZ(-dg~ZOs4_4L(6yyenf{;f-KI8A@ zU)tkcT%$TJ5SoBlZL(q(Zl2Pl&T*D`(^or*SG84=p9(i{YYl5h3Aob5wshDD+!$Kg z;V&tvdMCFoiz@9C%{(}^TWK3AE)b2%O=>9LGJ4}(w6Q$oblk&w0czy1gz47qn!1Lh zTc*+NmEE3sqhrt1lTvCBXOn0A!~)nKFb2M|<%2p+h(FzGDkv5GY6o&Y@8o4WW3-m; z(tBPG1*c5PxzWDbvkHs4y_Hl^`%N!@;o`Ojhv^)*}5uE@e?_2(p$?|PW9p!O$c$XSLyElBKh(`o2=h+-3k zksvddPv=FtHIWC5>#5Sm_ea<84eU3o5#BzezxFmH)G5?$pPuc|YzB8(1ShfDo!_?U zlIVHI{QUR=e|jytJvoK^k1ZZ$R`rDIF#WZR75yV3MsF_7#K&*Wwj}+OfLw?xh3gtq z{j2*!wR4OEirX)WiLW#7w)CrDNmnWOXf-1(8qm_N;cW-UREXnsjj2o5#;5oBB-GDO z9DU^2yuL8?NyLHE3$7cC%j0?;cmweezt3O9L+QMI-Ky{Uv!TJKAE5VJBKs-$Nv3M6 z0ZPdIS&sWzpG-w}RHxOJ6Ivt8oysXgE^euH*8;3fE&C{uHmXDe zI;C4U?np07aK{6^(&OGP;ZH*Cd73%svo2>49F|@-)pRF!wx3ogni!w;xM#4`cL`R% z{*Exbg>5JrZvj6Q_@a%k6;Qca8Rn`O#ji*be4Zi|di&KAbIk-_p1gCxgb#nZP(4bO zSmP`Hd||7W^D-{Rnq@LuCy=`)=B8w!q>FOD@dqgD#o|DV=bm@=vRFiZ2u^<@^i+zM zr>aZ=>K@dY=5XGfZFlYX+b9h=mhkqx2JSS?a69wcs0bOlx(xo1r}{gah8f@Q`!GbW z(A}7qylX1))O-?eq(@ZM&t&gRDLtPF0D6y{Nr6}>Q_u1Fm*`WvLd5DZhdZp40jwVKrbqX^X zFggj8;P~@7KdL?byw#$Efc1}M+fqq__5I8H z7iT|3?8{yatGjtT=jAgm2Zi@)&%JsGOeh0EImu}GmD)G2##1{46xOMA0;Pn-SE{3I zq8d&IWku~{AI)i{H?G42_SL!CyhI7TPQEqQ-sZ5~lP>2@6h&ZdQGsDw;O<=kcvHQ* zS1g!>MT_LK;gv8lWEE;xP^EOcVFfy1bJ;`O!E~+P!-utSYYVK23iDZ=Il8n!4ht)q zyix5}dT+_QWnz)TWma$JKz@fV!kFhuJ1h#{0#6Lr&lZW^`Kbkx)-=My7_9iVZQ{lr zoa=MwN4w!!@o;8mu#JWB7%6^|e|hCdL~-j{QV>ygF>krHZcY96noj5v`$wC&!e1+i z4Gx;iid8f$b;v!VFVPiAwqG%L#_0UAQTS|R~HgU;LyaxVh0%izX+p?m_T&11S*>spFyE-Xx+tfRrW@0KgvR2*`p- z1~_72NbrE6?6D{Z85tDT!R}wL2$Ar89-iK0*Nx;O5)clq4m6iu;GzKFM2G}=M-Q@_ z9R*B(Pfw7`XoKD!0p? 
zDw?}*NE5%r&XuJ28ieD}j_@q*gNn>b?TR)G4k-tqewm*%O7<5Y7%cV| zIWz!17jLqM+2E4-*@rY5^NU|unP2B8_iOwB07Rp0@&RB0S&)h6Cm$$4T6(h_Qbt;K zvtJY%E&IcoQ^0a3{FC1PQ@{li(WMCBfy9?0xS!XW#qY zn~$}oS9NvQ-_^6|>gvC72(mr{GVugdqQC6EJd7DMuk&>|RBOwYwe7U0icNorb@97= zl@jfYHWCdbCJ4i-{13k)1MqcF! zA$5XH-elW73s6Mc;L_dA+YP+IhKFu*Z>5YSmJFbYJt0KrQyShnV7@*2www;S3F*p9 zs&sG9V|fn#(wI1kBCK0Axf>c$?$5Rk4)Z6PWuEwIQA+kSGhbQbc$n5T=@3Z&euL@D zFM$E=AY;r{(jFf-aL7B(*s0-mS@X;YOIC|V=G*hFr;^|iGH9}|?F83P+NlbcYz_7; zok7zrrhS7Q=g-oN0foFpZkH>dXgcjWg!P;OJ`Jm(6BFUUuAZQPhisNUBLkd!%Dvt4 z4NfoR4vmNjalEPVU13EvB52%UK>Rb zS&bV}zirn2%9yHH@8-O5KS@FbOVv{V_ovF7A(@>xja=(BJrk3)o(?}yK)3J1#X#eb zk>h*iv5$!p{$2~yLxLqhn|?tbv83}<*b&5R+LZPL0;>Xl{T3Sf1SzABwBqH48*7kr zjBF4S-a>7nmYfPwc*mJ76h~cSBbfYCOoAwOM1zxhSknmNh-mP!cNnxCu1On|f#8d1 z(!SESPv2o7i#o@9hUbex<65^65Q6>a9AZB>8%88c?&^5y@9KONixybPnw&nd9LY4t zgi9)6rDJELA6_|wed;X4mRM`&?}AUTD8c9IR6Reg6w&tTj)dACPnXS=E`u&s{Mv9h zlfQtoV-|ACeocDVgM*tX{S;aTThvPBGe6RXC=_x6S&`7%&pZ!6kTkO-8d~FwG8Q8} z2b~S9b6+;ZQ{CiXLRu~Y1ej?Vh@qCtsoz16^>M&ItQuR!AYH*4u?;F;;+Q>mIObvB z6xbjcqPjqp%fosxvIq|=D#TIeM%ci;L(a4nJ^+ZWsuR6VK6#5>tYBRO5;JA{qW5V?S2i z9WFg6mqG`55CokaqztaJ#<6VqKv9NqRFr(!hkTG~3WOG&SKpTm#-)#!PZnY^9S7t* zFk4$L^>gQF^^ai|6k6>&+Z%P{SnzX_jv$k-N;Oa%5}G8~{q5h}ykXnyP?t4RrC9k= z^))3JLgM5dy{}X|x1Fj(GJN(gwAS!C_$3r^OTsk~K+d!KU00U?MeK(Sv{|$nhNAb5ZQwJWTY)PU{2t|mO-fuZgW08MS*GYV0NhuHwxZO4e z;^`cjJy+t6H+sY8i%#L8b5tcEs012ri}vp`$77==Fr7=`9C+$4#64~*LWvnD=2p>Q z?xtUqfbPhcJeDzXQX2iD$PgOy9T2s#o3IwoMrcrz#TGYRUew+Mb76fsK+A=a#)^G6 zM#i$+KYo5QGZY=|i38=SeO-4kqC>^FHesOHafoosP$ zeG0!SKvp?AC{i0}HAGoxO-Jjs(k3c&^ZT~jYd{#k*JXExc*KInDk5ZzFsEQhoK(p9 z!3`>1Ih*eNw-jAyM~Dr05H_YEImIb1F0@WGoA}h?bRn5zLVG za%Jr!W+Yw>(VaG$f7+$rC#J@p=apDLl~-x)f{o1R_esK>*&!cV14lALbr4v?5MGYA z$_Bw71}7+1G_j*5otHXIlAf-7v@}@HTm{O)t<$MyQLmKvqLPV%_KOQ8&!ODh7Nb-s z^=Ie%di}{_S$K(Jts{rGdEZ@BBg3ctLY!?V2GzefHvEkFyj;3o>v>D_8LG{ZLClWi zz>_T%CQOkTXTFpYy=QlsRGp};VZM)0AxC#)aqlUEe`o(15pJj%d-5Hj5_njyqwoa8 z8u#WcwA1L<7@Q7v8i#HIXu}UmMG)#BB0Sv{1-E8?PKFY53p=YJd6!;ES79S^&6Xll 
zK_y2dDTQ2XTWVx*zUZkvYFX$h>JP{Ma(y@4noK=(hg!uBXB$BV_1$Dzh8{GG5J!?^ z<61A^K@@iC&@4msSWx{T5jInFjQKu@09W0Y(QDrNN-=K(_xzArx2j@*A_|g48bc(H z0o=f_Az?P6%NFoW@O^bceq}PW(ntT4T0q%V2Qw!*u_tw;g;&vRkZ77{hk{F>YsW8} z==p*yBqVr4{^zr?3XMIx{L}B!4TX{$Z~d0mrz z@I1wj0<$|7oLOGB@&=f&RR?R(r+nx3GBwqYQ%jD-4kE|inP9y=E`t2DHy$2MQcPXWL;E}u_FSH;(cTXk9yLmM{+=lknRIa3rJj}K#DMb zDwT(mk-htkZc`W%aK{tzq;d>7%LmVi8u6<#PV7!dms=ks&WKrc13NAZo`&y-NRlDN z0h;aBFOiD8@eW!$TrK2QPa8giByY*|^Eo+T0tDGGZG0&2&tw-&33PAa3BZl7nM z#;~{mm+1#@%2m&0`98^CoTXE7MJXY+rfUqVpGm`=;R{SUjHFYdanmDZJf_{u5FxEk zOGF2t_n}6Wn7`81=Mk_|-#!su zo$S#1xp_TYd?EpmOXR5JpBj(|;$lw_Yr55u`0%gU!ZxmarSUX~EOYcnX^={vx_^S{ zilgD`HZ=0Y_m=1GLb;W*m|rPRYC4q+*V)ZEZULR!P35XKnk54KHlT_8MobDVn4| z{S=>p#xUZ(z*x%(GBw{yVeO5#Ol8li(C86nR)=(>UI*shB|LcH`mlek0Oq7qgFK+` z;HG$@GVgJmHqA%o=iRQ_c{QlTd@Z?8uwZ7IE(@2RfMf?K<&*l zl)=L{U`%N&{5=x1!ZtoNFBZL0AG=2q6q(%2rf>oXF-XeVd7I8TLK4Nu;CKZ^Dog2! z#IG@VgP!d!T*jWa-;Z(GyeUfY5p(?RK;yCrlO;ttn%H%HcZ-Jv(} z*hhUJx0|=^MC5Xw%V|Q?fqN_VbtcA%CXr++w=a^=AEduoB!+iNpFRl&u<%;fksJ79W3evKz z>^)IqDh@9i0&c~M-d0Ys;@f=c&jUffS^dav277_g!Ec+F}FwJrT|d<{-)&M z<_7$i5XkknR8Wvj(aPDz)0P6r4R%JM0eR^zCoi}C-lcAgTc*CFg)nYp%l*Y4?PEh~ zQ>9eGJ^?M$Z)>Yj{B?}slE?9xuM85Zc)M*^j0nrtGfz0EqkxThD^%(yPmnWuWz>m| ziFUuB)$Gu9Z}3u8aNF2^->#b#K608|+8}V5M4Jjv&?0M=@pk@XIZqhw&|=sbR2j*- z5~v?dwOi2}G49-)CO9$P(O_Y7#l0h?uUJccbvS9!NLtZyu;^kX;V=W!No6+wRVLnQ z{L!t+ne-z3tIMVXq={crVyDB062XK;{Sr%xw!0iWnI>f#!%{|r+@8Cm^;6-xEFY1^ z<<{Euu7`8$2Ej!yg5f9|MlDAcP%-cxU5FT0PehUDhBV)IY?4SMq@zLuajl^w#34U3SH zV7L(u?9gHNTpF-Po{M#@1zG1%r+$`@L%BQI*y1t*>(t0i-){M#-nuXvs0@Qbn(9rW z`uO?zLi6*qNAs+}b>Z>6*2@;HfYyM90Gcb)Xxp>1S`xWJsQj&Y;w$yFVDNL zTG|y3B|B=5c6VJkP$s$edpj0O14zRMTI!Sz$+9$3uII2Zqbjk|^*)A{yO z$y1|%ta{Skg!*ha5+tq~8K`L~1CrEIoXP4v+hP;Dudy$hixf>6{`lLeqfAtl%+w!C zqLu8B!TwNEc-kAYzmCNk>5a~GjZ)DI;zIQsho;(46fv*E^HkoDz?sn9dMn(f?BQVW zr39>UOqs=7;`?F2YngBbTrV`k|5#pd_%6J4MAe$2eZRr#j?LQ@5e;(e`*Fj4i%8&` z!eYNs$Nat@I_%qwWz(Tj>c+y8rgJFX5HMZ%Z9Y?XDoGMPgB?5@1EA?^1uU)2QnsHJ z;>3xRv-%#?Or;&$yPi1WX|hitvzM#_oi7-<$mu_>+@b@v&@*UVju+}rP!;XB 
zOu0SIt+-@--YGX$50Qds?U%{WI+nFDdM*)mRTTUEW{{^_IG5U){b+sJ>3>+YD@I`8 zZz1zCr~JxSGY(TrR6eT+&ajo>)kH7Ipdf@-8Y2^^KDwf10>|M$PX!}_8s35eZ;M-@ zbL3F6ZOO{S4CbBoef`d)x2%=LGmMn;seG#@JN zBF?XU7!3`xZr$2_0C367WRm+h}flG}Fl$ZTi31Gc|4V~sL_R(PqZQHhQr?GO1wPb>-X zM-gXj42}(_Q+~#uACIcO$&63VKihlEpWIVpwW0q8Gj@GY@nd3^C2{SjKS+=hcRjh+C9iEQ{!7*Jfe{>| zpE*6XFn_lD$N`vGl3nOrlYBYoXTPZ`*|F2I@S1Ug`G>k;q#tQckqan%_o?3Y#96$- zV*6_(*!YsiXg|<$#FbohMx`dc(7M8G3Kih0vTetI{}|JDAE3#6D>{9h$w-L_kneAzMo0>ftFg^WNsiSk;DJC1sT|pv>~H z+5=hZ=Sx05pPq=QHGRkMg6L2qcazK57nKO4InGB2Iu9GXW5uBw$PL}6z?nnyp?>Da zVQ(-wvrgX7E^XlT692SZRF=to952DPHAo`3K2f5ZqSY&6=*-ksYw_&dPJb1^plJqj z&L0)?Je5KGTH2Nuc%y>|5b$X{a)beIxhN65Yt<^+V=_FBJtMohn1H*jC`a!q&%F)K zG_$}I1fe*IOU9Jy=7Yb;I``O)C591mykr)i60ROdtQ=qTfkiXNUAQYh{Wt@u#rZ~p ztI3IAfPB21;8QFQNDi<$HZ}MfD;NsM1%}1phU5eb;wZqezlH+pNMI9uJa9h_HX@rk z4-(IB5jY+PA3T7A#roe8Vd4LdAAtXgAH>Z(tfXC>J=vsnR8_?Ez+$*mU=1P$XdpW~ zxSp68oKFOgM#0X*``Y67{q+I^NvPogug^TMD#duhnq1rzT)!{CZ{hFKU-#c7cJ|-+ zUYovt^Kx+frN_mM^p_ULZ^>(`zqUlIL5HBE%W|62U7ez}3bKi2sDDj>JS@3tVEb%d6F4eAHLV=%C*N z`VIg0^=E*uzWH7L+ed!}_{S-KJLjK{`Qx0w{Q0+G01^=J8rc0o^or{5q5KQstMYHz zZv$Ka&>x_G-1*zBf8hTa7SG>m|30waWj;>MKV7}HdmZjyj`{2UiUjFj6ZU&L{tNdX zQ-9K8fpXp*}=lLJA`Mc4-&*z^`UOW2@?N6t_I|hKCi3z~0gs8B8J=Nn9{XK^; zf6pNrCKoTK5fYn}vxSSL-D?Vq9SBY$B!vWmYYAPzhjI*X|NT3~#}yZM@ikxpvQx0I z19>TUxn3uq69`t*z`^7MBC$!BxyoAE+1PsikqR)=L9&D0NO8dp8hGGX3XV4(u4Wci zY@TesZ2oW9e!l|Jc8*pY6t6wKzCQo>MpSRSiA7;Dp2> z4LeW8GL(4ga>7oVm3bmZpFwX?D>XK#aC}za)#B2fz@qQ{8EEq8iLtRNz zLjy+^9L#sx3QkSUl{u_gA8H;xMrwJ0zc+wXy8X3UPlaw)&Vu(k_oxwV=tF5n4unC4 zZJJ4nB!oQ_6jaLDDO3%U6IOz<)}`22Hp+NRgdh&qJ@!UJMAx0Sl^)-|${Ed0r;1Si z(4@(e<^>n0yQp5Qzy34N*CZjO#%49@vTO&4vp&{QtHJd zuu=kzX_D?@e^9F@zY`ds9T7Lt^UVhFUZQj4sXu{65JNL4<<6B;=*IeIsam8;V+$=l zyImmTBko~KLSV{%rE9?!h^C;*SdcxY?23l{OlMAB|KXf6Fq$Qtd{ae{wiJ`*f{?P; z5Od~AYJn&kFBE>9`xmwQ)r$b_na58swhlsfj{u64R@ zGu*gR4#!mw(Rj=|yi-Y8R7t=C{O3w5xZ+oM>1&%?D-CAW&wx8$93g@oKpqE6pf_0{ z7fw9#JU(KAsX3ZjJ(p514H z<&4!rVfdde=l3f)4|_{0CQiZPlLf^BI2Z1Az8f7MzhOXxD9No}TJON_!)*T|s+I=( 
zE2nbf=Y;r7HQw+lOp(8-o6U+m=(e#Oe6*QUnsZz=$XZD|38p#=Z$XmYFjF|IW_lX8 zI7f~*3?f6Cui|gxaKm!}x$SVA$|KFc9%XzY*4JR9RDWjfR zo@trSL$dD|_wgvoe__k+O+C&60CFFe)j5-Pl0C1vHYOG-w|L^MskVpN?ZT1RrvdfR57 z9pDAj8r7~=vw@+;If(s5`%|4!rF0Am7RC^PSaspj{z?TZez%KMTI1eag{|_!PzTpu z`QvbX+ZEY9N=Ilv8>rX3rdrUfkvqjx6hC_uKuSnVels4qmrJXT0I?A-X}_sQXt)Fl*dj<6}1dy2No9{rk+oK zIGl5s-Co`cZYm5C?1ur%!D?(3L4X~$Fb=U}jtrK@ohKd}9ks7=vO3XKv`NsGMfOP8 z+32+OZ12bpY^S(Um5*q>cd4jb*^gtHIj<{PzT#*mBv^faAGUgsW?16A3i zeGKZ(^EQPw??V4AdQr3P?VavzXOgX8?j)-}`DBo64v1j$lD}J}RW3sCelu8NujKiG zG%6J6O9HH`7PFRTRSZQy7>h|SW8!GsGIz=M?_>Jj{|SaA&luvUDTj7ma`9Up%mZbI zw41S3yj&erVeKRLOvJhpAe}}-)){b1Tq6Wj5XW4U*pQEt~jWo=mTBO9za@7_0U2wigH_Ap`?hOTn z?OQ}AQiC8;oL$sS45QRw{z~Z}tiw(3Uqz*$oTrw~0(AcZzHfoUi9w2pi@GK4{HbB< zV%D#zRl`24oD4bAU%Q2@_Wd6HNq4MM8vm$sN#mOXrxx5E+f$Yn$yQq@qwgnY_PUIZWK%$l0#tI4;Y1)Nf3)`L+4(gAt9zYcgD>L|1I(YGr9hCP58*x{r*95wdbaI4zfY|=SF<$Ze193%&jDcWZl&`Gzyt_Zcxs0X*=1-A5^w>uM0G z3$s~Ih-c5aA1nccnZ4U+UV6V|?A0CRXbU!w1;9`~3AMnKE7zy;RX z&LQ05Z2_N6PU>F5%y`J^TgBluJVK2;Ym`)6Ry zBJSd7`6mYY6H)LBahR(eWa&iFsI9IsXZ%{t;mzu}NC} zRrlH$>>^Kq1LQ_xQ?m2$cn#xp>?~iCl&=9LI7?m>0m%1Phy_+d!2uspFoJ*_JRJWX zU1>NdYUq%Nq+hRb*Wf9%^AAo}2*zrPV*0h+il0Lw?Q>hug)*rYXy#LSIWhtPLUjC* z@`1Q3Qz$w4Z*5J%N|wX}H=3#hAKK6x2rimm?3CraS-ITuZZ=C>_a2`U)_=Imo`&$g zzf$Df!Oe2q1)&laAXoY2$;$M7j-&<)4F|A0dXRZ7`E6B->(UM|-i?g(v$K7i@5Rqm zQX$0poA`xnJVr z6x?we4v3RV2{&Scgbg^>Yaa7R`_LSdlX#Ff6s z;xJ>jVlG)AU772JAMr)im#;B6Xh240*PQkQw^x*lN)oa4u)0#m%HW3=r+hvU9}U8i z&ZoRFqg=~Tqx>So&?P=-f$R&k_v7rtvN+ion;?zkiCNx|!+jwllp9|nCeilToE(U- zP$xTRwg&R9c0BFPz`WPvMWZ~PU9OTM#cE++M|~t|y*2zqvHb&8hYgFx8Z+P@FHk!zu^a1R}*itNo|5<2JP$oUpec zlc0c1e-EX-Etikk&1N@n^L4|cfjGS;(M3s~+WI6rURc$w`tInu0(Apkn7LsQ!jZP@ zc$_u0GR(yvo(?mRhMYD~8wtDKVmhQnRnvl{{FCe{q(2U?sI|3I2nbbEq@rrc^jH%| ztdm2Xjj@#6*?4}t4l}4!l+U1ApbW3bn*vFpq)L@d6M7=EnBqzIXO_)RT$@ajB5%H6 ztw7$zkKfO1dWALHn`^-V6sLsg0?b5$cOx`q9T$5$Pa7OeDWnk-WjdrXHV$|7SfYxve7q9Sm_gdA zE;{Hz=|9isc2AjEn7+x!8Yfd-e!Ers*w?I)KW<5{5|5R5>_<(tI0WNp2@i?on>@>U 
z|AWCpeQV~H8T55atR3#t>7Wtf6S2uwdfSWQe0lZ!9N#zlc9Q+{$8UcLc7?Qf-5(POt`H z1Dm#d59((|#T~!!S!o9xe4msbowuN+HigD5hr{*3EbU9!pr>XVF?Qn#hj|oBZ~V!O z5>=}4X1yLMw5nszAUBm$_hK}9#7@ubxg6hrENHPzQ*Pk-1L}w_B9quPLa12MCC9Jw zZ)ye7COJCs%kJgrhJ+TSPei`PYiT`TtlAz_0sl98UKCWdZPF=7ZD+S2J^9{n?zeS+iN&z5aPy zu~|EUFPJ}K{!XX-+y0v!@Vfuz1aWe6@%;M|p@XQ2qw~F{<)8vfFI+h~%A1;jssWLK zLIGyc(nJeG(^AV$R1$+q&M=t)-OF4u9vIq|{{=Q$x+uR5CK-VWGilJ8-Yzv-dVJ-u zYd6miR@r)2WKYCq0<_UvS)tQ<`f%F4{!|k+LnIR|NpFw&^cJF;7qi4o}Xo9D$N=3Er$b$dv)&X%&BPcqO-eO^}k56<_Oum|5aFZ z9GcB#chFvncI}a)sDq!;6J&k}ZZK2QFD-}F+b*%Rv|>_c_mG%p&~{ldB}&a`-!x%+ zeo<7z!QD=62YnTAQ1i~7^j|0Wg!roaHQVO!c&nq=DQ%~H5jj0wS0jZGSGrw&SCf}3EGB>D;|EO| zD={gx#*en6$uR}i8VU?d_HVc%^`-qZzH2f!M>P~|F@VTN)AW_^)V)*4cT~nppcvro zfjhFW>dKlJTnR{}yBd@!9JuoBsvlD5OJoi-kjtY}AgyHO%6&e{r9@>a5^88LVB0ee zs{belr*L!SxR>Fiu;MC)O4+F)e-uAVwyj}M`DvMb)Q0;5E#X1Qk@-E=UhG42Xj6+r z&is9)Br~W!kw)%%Hq8_2iSQ|I+B`G^T9NvSn7H5BJZmU{oSgre-ihbrrs;tmfPJlK zn*k`csKTxUmo)0^m7F@R3a$oq*m%|+aUVPv?$xB{>y({Ud{TLN`^Qa?)%E;QEyZ(`-ybY=mm18KEtrr%t z(b&B%vDAP#{b7o}%`w9>n(6D4(B`Yd$L9@l7I0Z*?U+LMdNSvoFxjXpDI&b?>w(?{ z5HrKC_lhHA-yj)`DG0Wg`3PRn2m(o7sXtfP4qQ+q8 zJJF8At*LL~GU^6nXTD;7m)0$_mu{R~jfz~H#JDZMpG*)(5r|yF>$Do*Rg}^lCPD{o zZ7T|xQeyPqhd?&oFTeXxwnOiM_U2lqF|?AjOA9`FiiJ{~=RUqTzSWg(9-4li3CKm9 ziHWQMFAC$l!GZq8tliN5ER1TNHnJ`!Zf*&j#w~6JxKS8@a(qi4&RXXabqp0W^=X+) z))k_78>AX@i`E=EOX9WTWT%wB3m*+)0%?>ZTHZqRh>1_|wb-c+s%S;S)A#Wag^EG3 z4@U}C!LpN34ei-}i$WZFfj}nAxL81n+)DJRK1aFxbO$OVcTRqE-)zOa`S#AZC$8zq z+x^x~1e-Z4mUe+S;5sv;tNHr#sy(0ZfGKh_F^}&&sY1PScvg|oKK2xJ_gNn3k&0X) zcU7K2Jtjia93j;BeO}F+VkFg?arq)w;l@}Bk&8b1y{x|r<3romb4bWZ+56XXsrI1k zH;@BaK5rNMuG!TejHH(3TQWZ9e>+MGsGB*riF>O^$oV{kzfpqqiN*w=WgjDr!>FII z4os7pzc+LEl!3net8Gi?dELwigrz2QW6Cr?HUdziJX`o_q$Wis&-Fgu;HaBwXJcA$ z@gY)}|EC-7k?h$kj|vkI|Jl`D<;U3j)b$MV^iI-?7pg0U3ig`2UC*m0=FVx_X`75r<04F&FFWB1*hR5VTVr2-Z!fSn0zt zVK*6by*8V-1e&dyv*lypx-YZXLeyY80Cid7xRz8a2i*!+JJ~;U;lqmdT7}-+GD}1^ zOqCyC0Cw{H;ziCve#mgxQSg4J=9A{Aeg?XCpsr!mLK91gmV0kdTq14JTO&4XugY6Q 
zMj$B1G$KID{vC_+cg45XAOk3(pX}n&O<$1AfR4uzLaWHjam87;U2=i#$Te2QI!Z-> zIe7HJPepMyhMZ|}8WwXd?Byx~6?}SEZ+}@wMOvg>ZIu(GB{nq^stMwn1BvV~4YOeo z1KU4Gd`TlhcN`5!q=j)9r4?`EpR=|q9GEl}LbS_EyevV9xwUmN11XDr&+OOyd{1X` zlxot5W?tt#Bo2VEv!N##y=Af(!aly@p$j?r7~~QE3uR589h_TI&rORgKvXqilRLxa z*SXH+ojmPvd3{ni_T*Y;@D-#r;Iw*&cu!FO;f$1@GV+Wt6~wYDD8WNlHo|S`qIPUJ zdOa0zW+eD6WML@10c5^I{9&sp^yaD6{A3(9uPQ*qGe{ch(xy7AKiH+BvQ3Nafpb^# zl;tPw(x*yywpfgppiuY!KC>WAWYd?*n-xQuNd1K{B6s-L|Ku z*;pl#^j$9VxPl^Ze)9W{1iAkhXuK<%u@6=816Oc24(W%N2}UUPp`h?DZ{(=Aa~N%R zF;2tRdrvz7FyoxP9#`D0ntk4wmCIPTwxXT~z<&W4cNtekp<{m>{Y+dl(mB6-77`~7?wfr9mx zAjcSA>&?%hax~rW)jAh)ypY9#C2+jX&dhjl0)gQ$y}LxXF6z!qqwdXm`!AbC+frm> z5wZ;{X!Bnn9P>G~qCzgXE_riGUw+Z6j!ejrtZ9l?2~iH-=Ga?x-#iH;a-(&_?$XJV zUB^p5WMbwo#tRO^D9mDIorL@DP}(&nf%*{oq1q&49myd7hl8!G;~Hz{GuNjCp{SE; z6Gv+DYS-KhE<0RQ45B5Kx320ldkKMh-99{w!c+mEUdN>?Gek4;$?#O<9kiZjmdsmc z;U19U`(7*BV~v<#?Y@n=49*I2UWWX$!pg|XNt3(#VJ^?ScXr527bt{1v~|2Lg|H8% zh#b1DE8|Wh?(mdDuGYu|2zH_!-|vEX*PUr{Mh&od)~t#LMCj-=S-JbvWvb#=Nyox! zSd(->^mDr_;v$8WsXCbpj&NCYY+N^T)s72vSt=mwZ)8ZE_A)o`^Ew~B#tWoj?wNu3 z!?GOb5g5+--;s$e4%pes{OX;0`vjU39=3}D*=%ED8J{sWAASvHD?Nk_?P((DS_}_a zn}%t)5#_#ja)n%jnXP9jZvGB(>oLCmA{UAc+61Iy{9>@wT3b3{N&>cIiOm^9z;sc) z9QU1aeeZkz&KGMgzVm|fmL@@xWmWPe+CbW&DeI$%*n?K?LQA`%cU3+od`W`^kt@aR zwPG!9yO`E)NOed2pQ86Jud8A1U83F^CYJT-gb=nR4GYN{9CwZ3Ls+@mJ;MfFm6z9O2R zTk&QZ3CMEZxVlNMZMLJGb_WX0;$w?dGKncLP3feCc(`m-Pcnn#8>%PoC)Wki=Qv)F zzSF-susn#N7ch+V&!vtW9Md9ZEh&_>0HtK7r_=*E*vu4o-6a&>7n@8xABJ7g>c|DK zBJ#FfYN}V8sPIRriQJbTFTKk=v|v~pgI!CPG0J{XA*->b#15gO`=^tJw%cPztP!W6Xfxc zumnMv{`i`w84-RtB@@1;F6gu5FPq4)MD}T)V#ZSBK~@-h+^1sXK)2Lw-1zx^4=QSW z1x35;Fb|p{EG-r$>|YhWPNA~#QIT7Mk4OMxIH!*oVSUpJ98W;VQ>%_sx%*gVkfz8e~0 zKcykNlBt?o!yl|QA^a+A{`2{q&_lG_4=Wr4{;Qp!6h{>PdDv%2!93=&q!JJxs@doC zbz9fnys*yQc1x52HR}-;j^i^plVte;)q4}{xSxQ&z0VA%p_=^%=-=?kV|wch3`1u5 z5S|8*BU$~q+}$E5gv=f!rd#W@P_(9oH>+b!E6>sVSq?!g5g+HX+wUAg#Oqb6qg)0e zXynZA%*+&mKk%phT!w<;U1J9|NV#W4upF|uzz(vJ7P>^_+smS)9&$PZku-|G(8BCY zi1{ML`goe5RrSfsMOiXsI0q7~(m#@?twx 
zeExMEFJp_r<=6(+&GB_RNQbm#MWQH@Q67zC@=k_KS~I#jD2ERvJ8Td30MCa&;Lf-|k8HeF!EC~;3Sb*1b zs?v1CvteqG%}9QHH;FzN2wha9H&Rl?TJTBaP6K z2$4`#d`?`U@o0v;*|#5ro%;%PGA9?F9#SNbcB@s~N;h0wi&v$LOZRfj|%*p!{iKKPma5(@g| z13whO8Fluy)O&i6vhCK;5WZ%`tmr!J80*V^OANC~rCizHW0wJt2 zIu(|C@QVcZ`NY$bMQiP7tea>`2`rLWWe=;p*(PZ)b=f1RZ}K}O^Jlx~CcXv+d4*NRBEz{+Te68zE# z!4pCV!Y-97LPW2g>1XL#=!Gu0PU_?EIt!9(*Uz7)$Pu!d!;BR+g!a4PtxqEM{LOYT zKBe25+ccwo0x!4m?sb0Fm@d{*>AUx3&$V0Is;PL`1=*goy4y`Q1Zf?eeP}9((BL6* zUKf~s$FHaR&Z3tTe-m*5Ffp#iy-bOsLP&bxZVG!Hrb6yT#=poo)^V0h+|3n}p)@c7 zKrW=vvPDmXo8FMz+I_Ew&-@`T$H3chJ(o?SG7-H`50O=Wa8)c_`z&gWFzX~6JSv}~ zV^iWR3*w44_G?6JS^^y3D9c_W zKJI*?(?f6$US5o0RpZ0SOBjd7V}q*38xu=o0r>~ z?~fU}#vbEBNZD7bZ(a{uUTNW;b}r76W}a3IlKkudb`Ah9kQV^t1pYBi4*<}=swufx z{@*mfA5AInUO~JO7XEvHzV24mujHTC^O`kNVo`1m?pI>i?+=AD1=61ake7#p^B*z* zH#^@e0sTMp0NeloADGXK7R2{T$N5LyD+TEFz~?{oU)g1^grxtJ0l4{que_}P)B|v_ z|4%(G_E(PCf9L^t_}E|bhJaUM)W5(3xH{2 diff --git a/third-party/zlib/zlib.h b/third-party/zlib/zlib.h index 6b7244f9..8d4b932e 100644 --- a/third-party/zlib/zlib.h +++ b/third-party/zlib/zlib.h @@ -1,7 +1,7 @@ /* zlib.h -- interface of the 'zlib' general purpose compression library - version 1.3, August 18th, 2023 + version 1.3.1, January 22nd, 2024 - Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler + Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. 
In no event will the authors be held liable for any damages @@ -37,11 +37,11 @@ extern "C" { #endif -#define ZLIB_VERSION "1.3" -#define ZLIB_VERNUM 0x1300 +#define ZLIB_VERSION "1.3.1" +#define ZLIB_VERNUM 0x1310 #define ZLIB_VER_MAJOR 1 #define ZLIB_VER_MINOR 3 -#define ZLIB_VER_REVISION 0 +#define ZLIB_VER_REVISION 1 #define ZLIB_VER_SUBREVISION 0 /* @@ -936,10 +936,10 @@ ZEXTERN int ZEXPORT inflateSync(z_streamp strm); inflateSync returns Z_OK if a possible full flush point has been found, Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point has been found, or Z_STREAM_ERROR if the stream structure was inconsistent. - In the success case, the application may save the current current value of - total_in which indicates where valid compressed data was found. In the - error case, the application may repeatedly call inflateSync, providing more - input each time, until success or end of the input data. + In the success case, the application may save the current value of total_in + which indicates where valid compressed data was found. In the error case, + the application may repeatedly call inflateSync, providing more input each + time, until success or end of the input data. */ ZEXTERN int ZEXPORT inflateCopy(z_streamp dest, @@ -1758,14 +1758,14 @@ ZEXTERN uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2); seq1 and seq2 with lengths len1 and len2, CRC-32 check values were calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and - len2. + len2. len2 must be non-negative. */ /* ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t len2); Return the operator corresponding to length len2, to be used with - crc32_combine_op(). + crc32_combine_op(). len2 must be non-negative. 
*/ ZEXTERN uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op); diff --git a/third-party/zlib/zlib.map b/third-party/zlib/zlib.map index b330b606..31544f2e 100644 --- a/third-party/zlib/zlib.map +++ b/third-party/zlib/zlib.map @@ -1,100 +1,100 @@ -ZLIB_1.2.0 { - global: - compressBound; - deflateBound; - inflateBack; - inflateBackEnd; - inflateBackInit_; - inflateCopy; - local: - deflate_copyright; - inflate_copyright; - inflate_fast; - inflate_table; - zcalloc; - zcfree; - z_errmsg; - gz_error; - gz_intmax; - _*; -}; - -ZLIB_1.2.0.2 { - gzclearerr; - gzungetc; - zlibCompileFlags; -} ZLIB_1.2.0; - -ZLIB_1.2.0.8 { - deflatePrime; -} ZLIB_1.2.0.2; - -ZLIB_1.2.2 { - adler32_combine; - crc32_combine; - deflateSetHeader; - inflateGetHeader; -} ZLIB_1.2.0.8; - -ZLIB_1.2.2.3 { - deflateTune; - gzdirect; -} ZLIB_1.2.2; - -ZLIB_1.2.2.4 { - inflatePrime; -} ZLIB_1.2.2.3; - -ZLIB_1.2.3.3 { - adler32_combine64; - crc32_combine64; - gzopen64; - gzseek64; - gztell64; - inflateUndermine; -} ZLIB_1.2.2.4; - -ZLIB_1.2.3.4 { - inflateReset2; - inflateMark; -} ZLIB_1.2.3.3; - -ZLIB_1.2.3.5 { - gzbuffer; - gzoffset; - gzoffset64; - gzclose_r; - gzclose_w; -} ZLIB_1.2.3.4; - -ZLIB_1.2.5.1 { - deflatePending; -} ZLIB_1.2.3.5; - -ZLIB_1.2.5.2 { - deflateResetKeep; - gzgetc_; - inflateResetKeep; -} ZLIB_1.2.5.1; - -ZLIB_1.2.7.1 { - inflateGetDictionary; - gzvprintf; -} ZLIB_1.2.5.2; - -ZLIB_1.2.9 { - inflateCodesUsed; - inflateValidate; - uncompress2; - gzfread; - gzfwrite; - deflateGetDictionary; - adler32_z; - crc32_z; -} ZLIB_1.2.7.1; - -ZLIB_1.2.12 { - crc32_combine_gen; - crc32_combine_gen64; - crc32_combine_op; -} ZLIB_1.2.9; +ZLIB_1.2.0 { + global: + compressBound; + deflateBound; + inflateBack; + inflateBackEnd; + inflateBackInit_; + inflateCopy; + local: + deflate_copyright; + inflate_copyright; + inflate_fast; + inflate_table; + zcalloc; + zcfree; + z_errmsg; + gz_error; + gz_intmax; + _*; +}; + +ZLIB_1.2.0.2 { + gzclearerr; + gzungetc; + zlibCompileFlags; +} 
ZLIB_1.2.0; + +ZLIB_1.2.0.8 { + deflatePrime; +} ZLIB_1.2.0.2; + +ZLIB_1.2.2 { + adler32_combine; + crc32_combine; + deflateSetHeader; + inflateGetHeader; +} ZLIB_1.2.0.8; + +ZLIB_1.2.2.3 { + deflateTune; + gzdirect; +} ZLIB_1.2.2; + +ZLIB_1.2.2.4 { + inflatePrime; +} ZLIB_1.2.2.3; + +ZLIB_1.2.3.3 { + adler32_combine64; + crc32_combine64; + gzopen64; + gzseek64; + gztell64; + inflateUndermine; +} ZLIB_1.2.2.4; + +ZLIB_1.2.3.4 { + inflateReset2; + inflateMark; +} ZLIB_1.2.3.3; + +ZLIB_1.2.3.5 { + gzbuffer; + gzoffset; + gzoffset64; + gzclose_r; + gzclose_w; +} ZLIB_1.2.3.4; + +ZLIB_1.2.5.1 { + deflatePending; +} ZLIB_1.2.3.5; + +ZLIB_1.2.5.2 { + deflateResetKeep; + gzgetc_; + inflateResetKeep; +} ZLIB_1.2.5.1; + +ZLIB_1.2.7.1 { + inflateGetDictionary; + gzvprintf; +} ZLIB_1.2.5.2; + +ZLIB_1.2.9 { + inflateCodesUsed; + inflateValidate; + uncompress2; + gzfread; + gzfwrite; + deflateGetDictionary; + adler32_z; + crc32_z; +} ZLIB_1.2.7.1; + +ZLIB_1.2.12 { + crc32_combine_gen; + crc32_combine_gen64; + crc32_combine_op; +} ZLIB_1.2.9; diff --git a/third-party/zlib/zutil.h b/third-party/zlib/zutil.h index 902a304c..48dd7feb 100644 --- a/third-party/zlib/zutil.h +++ b/third-party/zlib/zutil.h @@ -1,5 +1,5 @@ /* zutil.h -- internal interface and configuration of the compression library - * Copyright (C) 1995-2022 Jean-loup Gailly, Mark Adler + * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h */ @@ -56,7 +56,7 @@ typedef unsigned long ulg; extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ /* (size given to avoid silly warnings with Visual C++) */ -#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] +#define ERR_MSG(err) z_errmsg[(err) < -6 || (err) > 2 ? 
9 : 2 - (err)] #define ERR_RETURN(strm,err) \ return (strm->msg = ERR_MSG(err), (err)) @@ -137,17 +137,8 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ # endif #endif -#if defined(MACOS) || defined(TARGET_OS_MAC) +#if defined(MACOS) # define OS_CODE 7 -# ifndef Z_SOLO -# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os -# include /* for fdopen */ -# else -# ifndef fdopen -# define fdopen(fd,mode) NULL /* No fdopen() */ -# endif -# endif -# endif #endif #ifdef __acorn @@ -170,18 +161,6 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ # define OS_CODE 19 #endif -#if defined(_BEOS_) || defined(RISCOS) -# define fdopen(fd,mode) NULL /* No fdopen() */ -#endif - -#if (defined(_MSC_VER) && (_MSC_VER > 600)) && !defined __INTERIX -# if defined(_WIN32_WCE) -# define fdopen(fd,mode) NULL /* No fdopen() */ -# else -# define fdopen(fd,type) _fdopen(fd,type) -# endif -#endif - #if defined(__BORLANDC__) && !defined(MSDOS) #pragma warn -8004 #pragma warn -8008 diff --git a/third-party/zstd/.cirrus.yml b/third-party/zstd/.cirrus.yml index 27ca65e8..bf3f0c41 100644 --- a/third-party/zstd/.cirrus.yml +++ b/third-party/zstd/.cirrus.yml @@ -2,8 +2,8 @@ task: name: FreeBSD (shortest) freebsd_instance: matrix: - image_family: freebsd-13-0 - image_family: freebsd-12-2 + image_family: freebsd-14-0 + image_family: freebsd-13-2 install_script: pkg install -y gmake coreutils script: | MOREFLAGS="-Werror" gmake -j all diff --git a/third-party/zstd/.github/workflows/commit.yml b/third-party/zstd/.github/workflows/commit.yml new file mode 100644 index 00000000..25d8c52f --- /dev/null +++ b/third-party/zstd/.github/workflows/commit.yml @@ -0,0 +1,89 @@ +name: facebook/zstd/commit +on: + push: + branches: + - dev +permissions: read-all +jobs: + short-tests-0: + runs-on: ubuntu-latest + services: + docker: + image: fbopensource/zstd-circleci-primary:0.0.1 + options: --entrypoint /bin/bash + steps: + - uses: 
actions/checkout@v4 + - name: Install Dependencies + run: | + sudo apt-get update + sudo apt-get install libcurl4-gnutls-dev + - name: Test + run: | + ./tests/test-license.py + cc -v + CFLAGS="-O0 -Werror -pedantic" make allmost; make clean + make c99build; make clean + make c11build; make clean + make -j regressiontest; make clean + make shortest; make clean + make cxxtest; make clean + short-tests-1: + runs-on: ubuntu-latest + services: + docker: + image: fbopensource/zstd-circleci-primary:0.0.1 + options: --entrypoint /bin/bash + steps: + - uses: actions/checkout@v4 + - name: Install Dependencies + run: | + sudo apt-get update + sudo apt-get install gcc-powerpc-linux-gnu gcc-arm-linux-gnueabi gcc-aarch64-linux-gnu libc6-dev-ppc64-powerpc-cross libcurl4-gnutls-dev lib64gcc-11-dev-powerpc-cross + - name: Test + run: |- + make gnu90build; make clean + make gnu99build; make clean + make ppc64build V=1; make clean + make ppcbuild V=1; make clean + make armbuild V=1; make clean + make aarch64build V=1; make clean + make -C tests test-legacy test-longmatch; make clean + make -C lib libzstd-nomt; make clean + regression-test: + runs-on: ubuntu-latest + services: + docker: + image: fbopensource/zstd-circleci-primary:0.0.1 + options: --entrypoint /bin/bash + env: + CIRCLE_ARTIFACTS: "/tmp/circleci-artifacts" + steps: + - uses: actions/checkout@v4 + - name: restore_cache + uses: actions/cache@v4 + with: + key: regression-cache-{{ checksum "tests/regression/data.c" }}-v0 + path: tests/regression/cache + restore-keys: regression-cache-{{ checksum "tests/regression/data.c" }}-v0 + - name: Install Dependencies + run: | + sudo apt-get update + sudo apt-get install libcurl4-gnutls-dev + - name: Regression Test + run: | + make -C programs zstd + make -C tests/regression test + mkdir -p $CIRCLE_ARTIFACTS + ./tests/regression/test \ + --cache tests/regression/cache \ + --output $CIRCLE_ARTIFACTS/results.csv \ + --zstd programs/zstd + echo "NOTE: The new results.csv is uploaded as 
an artifact to this job" + echo " If this fails, go to the Artifacts pane in CircleCI, " + echo " download /tmp/circleci-artifacts/results.csv, and if they " + echo " are still good, copy it into the repo and commit it." + echo "> diff tests/regression/results.csv $CIRCLE_ARTIFACTS/results.csv" + diff tests/regression/results.csv $CIRCLE_ARTIFACTS/results.csv + - uses: actions/upload-artifact@v4 + with: + path: "/tmp/circleci-artifacts" diff --git a/third-party/zstd/.github/workflows/dev-long-tests.yml b/third-party/zstd/.github/workflows/dev-long-tests.yml index 16202260..eb8f40a9 100644 --- a/third-party/zstd/.github/workflows/dev-long-tests.yml +++ b/third-party/zstd/.github/workflows/dev-long-tests.yml @@ -15,7 +15,7 @@ jobs: make-all: runs-on: ubuntu-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: make all run: make all @@ -26,7 +26,7 @@ jobs: DEVNULLRIGHTS: 1 READFROMBLOCKDEVICE: 1 steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: make test run: make test @@ -34,7 +34,7 @@ jobs: make-test-osx: runs-on: macos-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: OS-X test run: make test # make -c lib all doesn't work because of the fact that it's not a tty @@ -45,7 +45,7 @@ jobs: DEVNULLRIGHTS: 1 READFROMBLOCKDEVICE: 1 steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: make test run: | sudo apt-get -qqq update @@ -55,29 +55,29 @@ jobs: no-intrinsics-fuzztest: runs-on: ubuntu-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + 
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: no intrinsics fuzztest run: MOREFLAGS="-DZSTD_NO_INTRINSICS" make -C tests fuzztest tsan-zstreamtest: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: thread sanitizer zstreamtest run: CC=clang ZSTREAM_TESTTIME=-T3mn make tsan-test-zstream ubsan-zstreamtest: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: undefined behavior sanitizer zstreamtest run: CC=clang make uasan-test-zstream # lasts ~15mn tsan-fuzztest: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: thread sanitizer fuzztest run: CC=clang make tsan-fuzztest @@ -85,7 +85,7 @@ jobs: big-tests-zstreamtest32: runs-on: ubuntu-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: zstream tests in 32bit mode, with big tests run: | sudo apt-get -qqq update @@ -94,9 +94,9 @@ jobs: # lasts ~23mn gcc-8-asan-ubsan-testzstd: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: gcc-8 + ASan + UBSan + Test Zstd # See https://askubuntu.com/a/1428822 run: | @@ -106,16 +106,16 @@ jobs: CC=gcc-8 make -j uasan-test-zstd - @@ -524,31 +533,12 @@ jobs: make -C tests fuzzer && ./tests/fuzzer.exe -v -T1m - intel-cet-compatibility: - runs-on: 
ubuntu-latest - steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 - - name: Build Zstd - run: | - make -j zstd V=1 - readelf -n zstd - - name: Get Intel SDE - run: | - curl -LO https://downloadmirror.intel.com/684899/sde-external-9.0.0-2021-11-07-lin.tar.xz - tar xJvf sde-external-9.0.0-2021-11-07-lin.tar.xz - - name: Configure Permissions - run: | - echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope - - name: Run Under SDE - run: | - sde-external-9.0.0-2021-11-07-lin/sde -cet -cet-raise 0 -cet-endbr-exe -cet-stderr -cet-abort -- ./zstd -b3 - pkg-config: runs-on: ubuntu-latest container: image: debian:testing steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: Install dependencies run: | apt -y update @@ -563,7 +553,7 @@ jobs: versions-compatibility: runs-on: ubuntu-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: Versions Compatibility Test run: | make -C tests versionsTest @@ -571,7 +561,7 @@ jobs: clangbuild: runs-on: ubuntu-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: make clangbuild run: | make clangbuild @@ -579,7 +569,7 @@ jobs: clang-pgo: runs-on: ubuntu-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: Build PGO Zstd with Clang env: CC: clang-14 @@ -591,7 +581,7 @@ jobs: gcc-pgo: runs-on: ubuntu-latest steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 - name: Build PGO Zstd with GCC env: CC: gcc @@ 
-599,10 +589,29 @@ jobs: make -C programs zstd-pgo ./programs/zstd -b + intel-cet-compatibility: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 + - name: Build Zstd + run: | + make -j zstd V=1 + readelf -n zstd + - name: Get Intel SDE + run: | + curl -LO https://downloadmirror.intel.com/813591/sde-external-9.33.0-2024-01-07-lin.tar.xz + tar xJvf sde-external-9.33.0-2024-01-07-lin.tar.xz + - name: Configure Permissions + run: | + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + - name: Run Under SDE + run: | + sde-external-9.33.0-2024-01-07-lin/sde -cet -cet-raise 0 -cet-endbr-exe -cet-stderr -cet-abort -- ./zstd -b3 + + +# Failing tests, for reference -# For reference : icc tests # icc tests are currently failing on Github Actions, likely to issues during installation stage -# To be fixed later # # icc: # name: icc-check @@ -618,7 +627,7 @@ jobs: # sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" # sudo apt-get update # sudo apt-get install -y intel-basekit intel-hpckit -# - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 +# - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v4.1.1 # - name: make check # run: | # make CC=/opt/intel/oneapi/compiler/latest/linux/bin/intel64/icc check diff --git a/third-party/zstd/.github/workflows/nightly.yml b/third-party/zstd/.github/workflows/nightly.yml new file mode 100644 index 00000000..704e7892 --- /dev/null +++ b/third-party/zstd/.github/workflows/nightly.yml @@ -0,0 +1,65 @@ +name: facebook/zstd/nightly +on: + schedule: + - cron: 0 0 * * * + push: + branches: + - release + - dev + - master +permissions: read-all +jobs: + regression-test: + runs-on: ubuntu-latest + services: + docker: + image: fbopensource/zstd-circleci-primary:0.0.1 + options: --entrypoint /bin/bash + env: + CIRCLE_ARTIFACTS: "/tmp/circleci-artifacts" + steps: + - uses: actions/checkout@v4 + - uses: 
actions/cache@v4 + with: + key: regression-cache-{{ checksum "tests/regression/data.c" }}-v0 + path: tests/regression/cache + restore-keys: regression-cache-{{ checksum "tests/regression/data.c" }}-v0 + - uses: actions/upload-artifact@v4 + with: + path: "/tmp/circleci-artifacts" + - name: Install Dependencies + run: | + sudo apt-get update + sudo apt-get install libcurl4-gnutls-dev + - name: Regression Test + run: | + make -C programs zstd + make -C tests/regression test + mkdir -p $CIRCLE_ARTIFACTS + ./tests/regression/test \ + --cache tests/regression/cache \ + --output $CIRCLE_ARTIFACTS/results.csv \ + --zstd programs/zstd + echo "NOTE: The new results.csv is uploaded as an artifact to this job" + echo " If this fails, go to the Artifacts pane in CircleCI, " + echo " download /tmp/circleci-artifacts/results.csv, and if they " + echo " are still good, copy it into the repo and commit it." + echo "> diff tests/regression/results.csv $CIRCLE_ARTIFACTS/results.csv" + diff tests/regression/results.csv $CIRCLE_ARTIFACTS/results.csv + +# Longer tests + #- make -C tests test-zstd-nolegacy && make clean + #- pyenv global 3.4.4; make -C tests versionsTest && make clean + #- make zlibwrapper && make clean + #- gcc -v; make -C tests test32 MOREFLAGS="-I/usr/include/x86_64-linux-gnu" && make clean + #- make uasan && make clean + #- make asan32 && make clean + #- make -C tests test32 CC=clang MOREFLAGS="-g -fsanitize=address -I/usr/include/x86_64-linux-gnu" +# Valgrind tests + #- CFLAGS="-O1 -g" make -C zlibWrapper valgrindTest && make clean + #- make -C tests valgrindTest && make clean +# ARM, AArch64, PowerPC, PowerPC64 tests + #- make ppctest && make clean + #- make ppc64test && make clean + #- make armtest && make clean + #- make aarch64test && make clean diff --git a/third-party/zstd/.github/workflows/publish-release-artifacts.yml b/third-party/zstd/.github/workflows/publish-release-artifacts.yml index 3c942394..f7af2791 100644 --- 
a/third-party/zstd/.github/workflows/publish-release-artifacts.yml +++ b/third-party/zstd/.github/workflows/publish-release-artifacts.yml @@ -17,7 +17,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v3 - name: Archive env: diff --git a/third-party/zstd/.github/workflows/scorecards.yml b/third-party/zstd/.github/workflows/scorecards.yml index 8a693fa2..a5d5f02a 100644 --- a/third-party/zstd/.github/workflows/scorecards.yml +++ b/third-party/zstd/.github/workflows/scorecards.yml @@ -27,12 +27,12 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v3 with: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@e38b1902ae4f44df626f11ba0734b14fb91f8f86 # tag=v2.1.2 + uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # tag=v2.3.1 with: results_file: results.sarif results_format: sarif @@ -51,7 +51,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # tag=v3.1.2 + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # tag=v4.3.1 with: name: SARIF file path: results.sarif @@ -59,6 +59,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@67a35a08586135a9573f4327e904ecbf517a882d # tag=v2.2.8 + uses: github/codeql-action/upload-sarif@3ab4101902695724f9365a384f86c1074d94e18c # tag=v3.24.7 with: sarif_file: results.sarif diff --git a/third-party/zstd/.github/workflows/windows-artifacts.yml b/third-party/zstd/.github/workflows/windows-artifacts.yml index 7d73b4b0..52bc90a4 100644 --- a/third-party/zstd/.github/workflows/windows-artifacts.yml +++ b/third-party/zstd/.github/workflows/windows-artifacts.yml @@ -10,19 +10,26 @@ on: permissions: read-all jobs: - windows-64-artifacts: + windows-artifacts: # see https://ariya.io/2020/07/on-github-actions-with-msys2 runs-on: windows-latest + # see https://github.com/msys2/setup-msys2 + strategy: + matrix: + include: + - { msystem: mingw64, env: x86_64, ziparch: win64 } + - { msystem: mingw32, env: i686, ziparch: win32 } defaults: run: shell: msys2 {0} steps: - - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # tag=v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # tag=v3 - uses: msys2/setup-msys2@5beef6d11f48bba68b9eb503e3adc60b23c0cc36 # tag=v2 with: - msystem: MINGW64 - install: make zlib git p7zip mingw-w64-x86_64-gcc + msystem: ${{ matrix.msystem }} + install: make zlib git p7zip mingw-w64-${{matrix.env}}-gcc update: true + - name: display versions run: | make -v @@ -33,19 +40,19 @@ jobs: git clone --depth 1 --branch v1.2.11 https://github.com/madler/zlib make -C zlib -f win32/Makefile.gcc libz.a - - name: Building zstd programs in 64-bit mode + - name: Building zstd programs run: | CPPFLAGS=-I../zlib LDFLAGS=../zlib/libz.a make -j allzstd MOREFLAGS=-static V=1 - name: Create artifacts run: | ./lib/dll/example/build_package.bat - mv bin/ zstd-${{ github.ref_name }}-win64/ - 7z a -tzip -mx9 zstd-${{ github.ref_name }}-win64.zip zstd-${{ github.ref_name }}-win64/ + mv bin/ zstd-${{ github.ref_name }}-${{matrix.ziparch}}/ + 7z a -tzip -mx9 zstd-${{ 
github.ref_name }}-${{matrix.ziparch}}.zip zstd-${{ github.ref_name }}-${{matrix.ziparch}}/ cd .. - - name: Publish zstd-$VERSION-win64.zip - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # tag=v3 + - name: Publish zstd-$VERSION-${{matrix.ziparch}}.zip + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # tag=v4.3.1 with: - path: ${{ github.workspace }}/zstd-${{ github.ref_name }}-win64.zip - name: zstd-${{ github.ref_name }}-win64.zip + path: ${{ github.workspace }}/zstd-${{ github.ref_name }}-${{matrix.ziparch}}.zip + name: zstd-${{ github.ref_name }}-${{matrix.ziparch}}.zip diff --git a/third-party/zstd/.gitignore b/third-party/zstd/.gitignore index a136ea39..34e18b44 100644 --- a/third-party/zstd/.gitignore +++ b/third-party/zstd/.gitignore @@ -27,6 +27,8 @@ tmp* dictionary. dictionary NUL +cmakebuild/ +install/ # Build artefacts contrib/linux-kernel/linux/ @@ -37,11 +39,15 @@ buck-out/ build-* *.gcda +# IDE +.clang_complete +compile_flags.txt +.clang-format + # Other files .directory _codelite/ _zstdbench/ -.clang_complete *.idea *.swp .DS_Store diff --git a/third-party/zstd/.travis.yml b/third-party/zstd/.travis.yml deleted file mode 100644 index b96bf8ba..00000000 --- a/third-party/zstd/.travis.yml +++ /dev/null @@ -1,128 +0,0 @@ -# Travis CI is used to test platforms that github-actions currently doesn't support -# without either self-hosting or some finnicky work-around. Also, some tests -# are troublesome to migrate since GH Actions runs tests not in a tty. 
-language: c - -git: - depth: 1 - -branches: - only: - - dev - - release - - master - - travisTest - -addons: - apt: - update: true - -env: - global: - - FUZZERTEST=-T1mn - ZSTREAM_TESTTIME=-T1mn - DECODECORPUS_TESTTIME=-T1mn - -matrix: - fast_finish: true - include: - - name: S390X (big endian) + Fuzz test - dist: trusty - arch: s390x - script: - - FUZZER_FLAGS=--no-big-tests make -C tests fuzztest - - - name: S390X (big endian) + Fuzz test + no intrinsics - dist: trusty - arch: s390x - script: - - MOREFLAGS="-DZSTD_NO_INTRINSICS" FUZZER_FLAGS=--no-big-tests make -C tests fuzztest - - - name: arm64 # ~2.5 mn - os: linux - arch: arm64 - script: - - make check - - - name: arm64fuzz - os: linux - arch: arm64 - script: - - make -C tests fuzztest - - # TODO: migrate to GH Actions once newest clang staticanalyze warnings are fixed - - name: static analyzer scanbuild # ~8mn - dist: trusty # note : it's important to pin down a version of static analyzer, since different versions report different false positives - script: - - make staticAnalyze - - # GH actions can't run this command on OS-X, non-tty issues - - name: OS-X make all lib - os: osx - script: - - make -C lib all - - # Introduced to check compat with old toolchains, to prevent e.g. #1872 - - name: ARM Build Test (on Trusty) - dist: trusty - script: - - make arminstall - - make armbuild - - # check release number (release/new tag only) - - name: Tag-Specific Test - if: tag =~ ^v[0-9]\.[0-9] - script: - - make -C tests checkTag - - tests/checkTag "$TRAVIS_BRANCH" - - - name: PPC64LE + Fuzz test # ~13mn - arch: ppc64le - env: - - FUZZER_FLAGS=--no-big-tests - - MOREFLAGS="-static" - script: - - cat /proc/cpuinfo - - make -C tests fuzztest - - # This test currently fails on GA specifically, for no obvious reason - # (it works fine on travisCI, and on local test platforms). 
- - name: Versions Compatibility Test # ~6mn - script: - - make -C tests versionsTest - - # meson dedicated test - - name: Focal (Meson + clang) # ~15mn - dist: focal - language: cpp - compiler: clang - install: - - sudo apt-get install -qq liblz4-dev valgrind tree - - | - travis_retry curl -o ~/ninja.zip -L 'https://github.com/ninja-build/ninja/releases/download/v1.9.0/ninja-linux.zip' && - unzip ~/ninja.zip -d ~/.local/bin - - | - travis_retry curl -o ~/get-pip.py -L 'https://bootstrap.pypa.io/pip/3.6/get-pip.py' && - python3 ~/get-pip.py --user && - pip3 install --user meson - script: - - | - meson setup \ - --buildtype=debugoptimized \ - -Db_lundef=false \ - -Dauto_features=enabled \ - -Dbin_programs=true \ - -Dbin_tests=true \ - -Dbin_contrib=true \ - -Ddefault_library=both \ - build/meson builddir - - pushd builddir - - ninja - - meson test --verbose --no-rebuild - - DESTDIR=./staging ninja install - - tree ./staging - after_failure: - - cat "$TRAVIS_BUILD_DIR"/builddir/meson-logs/testlog.txt - - allow_failures: - - env: ALLOW_FAILURES=true diff --git a/third-party/zstd/CHANGELOG b/third-party/zstd/CHANGELOG index c7a7506e..33f43410 100644 --- a/third-party/zstd/CHANGELOG +++ b/third-party/zstd/CHANGELOG @@ -1,3 +1,40 @@ +V1.5.6 (Mar 2024) +api: Promote `ZSTD_c_targetCBlockSize` to Stable API by @felixhandte +api: new `ZSTD_d_maxBlockSize` experimental parameter, to reduce streaming decompression memory, by @terrelln +perf: improve performance of param `ZSTD_c_targetCBlockSize`, by @Cyan4973 +perf: improved compression of arrays of integers at high compression, by @Cyan4973 +lib: reduce binary size with selective built-time exclusion, by @felixhandte +lib: improved huffman speed on small data and linux kernel, by @terrelln +lib: accept dictionaries with partial literal tables, by @terrelln +lib: fix CCtx size estimation with external sequence producer, by @embg +lib: fix corner case decoder behaviors, by @Cyan4973 and @aimuz +lib: fix zdict prototype mismatch 
in static_only mode, by @ldv-alt +lib: fix several bugs in magicless-format decoding, by @embg +cli: add common compressed file types to `--exclude-compressed`` by @daniellerozenblit +cli: fix mixing `-c` and `-o` commands with `--rm`, by @Cyan4973 +cli: fix erroneous exclusion of hidden files with `--output-dir-mirror` by @felixhandte +cli: improved time accuracy on BSD, by @felixhandte +cli: better errors on argument parsing, by @KapJI +tests: better compatibility with older versions of `grep`, by @Cyan4973 +tests: lorem ipsum generator as default backup content, by @Cyan4973 +build: cmake improvements by @terrelln, @sighingnow, @gjasny, @JohanMabille, @Saverio976, @gruenich, @teo-tsirpanis +build: bazel support, by @jondo2010 +build: fix cross-compiling for AArch64 with lld by @jcelerier +build: fix Apple platform compatibility, by @nidhijaju +build: fix Visual 2012 and lower compatibility, by @Cyan4973 +build: improve win32 support, by @DimitriPapadopoulos +build: better C90 compliance for zlibWrapper, by @emaste +port: make: fat binaries on macos, by @mredig +port: ARM64EC compatibility for Windows, by @dunhor +port: QNX support by @klausholstjacobsen +port: MSYS2 and Cygwin makefile installation and test support, by @QBos07 +port: risc-v support validation in CI, by @Cyan4973 +port: sparc64 support validation in CI, by @Cyan4973 +port: AIX compatibility, by @likema +port: HP-UX compatibility, by @likema +doc: Improved specification accuracy, by @elasota +bug: Fix and deprecate ZSTD_generateSequences (#3981) + v1.5.5 (Apr 2023) fix: fix rare corruption bug affecting the high compression mode, reported by @danlark1 (#3517, @terrelln) perf: improve mid-level compression speed (#3529, #3533, #3543, @yoniko and #3552, @terrelln) @@ -98,7 +135,7 @@ build: support for m68k (Motorola 68000's), by @cyan4973 build: improved AIX support, by @Helflym build: improved meson unofficial build, by @eli-schwartz cli : custom memory limit when training dictionary (#2925), by 
@embg -cli : report advanced parameters information when compressing in very verbose mode (``-vv`), by @Svetlitski-FB +cli : report advanced parameters information when compressing in very verbose mode (`-vv`), by @Svetlitski-FB v1.5.0 (May 11, 2021) api: Various functions promoted from experimental to stable API: (#2579-2581, @senhuang42) @@ -165,7 +202,7 @@ api: Add Function to Generate Skippable Frame (#2439, @senhuang42) perf: New Algorithms for the Long Distance Matcher (#2483, @mpu) perf: Performance Improvements for Long Distance Matcher (#2464, @mpu) perf: Don't Shrink Window Log when Streaming with a Dictionary (#2451, @terrelln) -cli: Fix `--output-dir-mirror`'s Rejection of `..`-Containing Paths (#2512, @felixhandte) +cli: Fix `--output-dir-mirror` rejection of `..` -containing paths (#2512, @felixhandte) cli: Allow Input From Console When `-f`/`--force` is Passed (#2466, @felixhandte) cli: Improve Help Message (#2500, @senhuang42) tests: Remove Flaky Tests (#2455, #2486, #2445, @Cyan4973) diff --git a/third-party/zstd/CONTRIBUTING.md b/third-party/zstd/CONTRIBUTING.md index f5e747ae..47f5bb8f 100644 --- a/third-party/zstd/CONTRIBUTING.md +++ b/third-party/zstd/CONTRIBUTING.md @@ -171,8 +171,8 @@ who want earlier signal. | Cirrus CI | Used for testing on FreeBSD | https://github.com/marketplace/cirrus-ci/ | `.cirrus.yml` | | Circle CI | Historically was used to provide faster signal,
but we may be able to migrate these to Github Actions | https://circleci.com/docs/2.0/getting-started/#setting-up-circleci
https://youtu.be/Js3hMUsSZ2c
https://circleci.com/docs/2.0/enable-checks/ | `.circleci/config.yml` | -Note: the instructions linked above mostly cover how to set up a repository with CI from scratch. -The general idea should be the same for setting up CI on your fork of zstd, but you may have to +Note: the instructions linked above mostly cover how to set up a repository with CI from scratch. +The general idea should be the same for setting up CI on your fork of zstd, but you may have to follow slightly different steps. In particular, please ignore any instructions related to setting up config files (since zstd already has configs for each of these services). @@ -216,7 +216,7 @@ will typically not be stable enough to obtain reliable benchmark results. If you hands on a desktop, this is usually a better scenario. Of course, benchmarking can be done on non-hyper-stable machines as well. You will just have to -do a little more work to ensure that you are in fact measuring the changes you've made not and +do a little more work to ensure that you are in fact measuring the changes you've made and not noise. Here are some things you can do to make your benchmarks more stable: 1. 
The most simple thing you can do to drastically improve the stability of your benchmark is diff --git a/third-party/zstd/Makefile b/third-party/zstd/Makefile index 3b2e3999..11eca19c 100644 --- a/third-party/zstd/Makefile +++ b/third-party/zstd/Makefile @@ -145,13 +145,13 @@ clean: $(Q)$(MAKE) -C contrib/largeNbDicts $@ > $(VOID) $(Q)$(MAKE) -C contrib/externalSequenceProducer $@ > $(VOID) $(Q)$(RM) zstd$(EXT) zstdmt$(EXT) tmp* - $(Q)$(RM) -r lz4 + $(Q)$(RM) -r lz4 cmakebuild install @echo Cleaning completed #------------------------------------------------------------------------------ # make install is validated only for Linux, macOS, Hurd and some BSD targets #------------------------------------------------------------------------------ -ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT Haiku AIX)) +ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT CYGWIN_NT Haiku AIX)) HOST_OS = POSIX @@ -197,6 +197,15 @@ uninstall: travis-install: $(MAKE) install PREFIX=~/install_test_dir +.PHONY: clangbuild-darwin-fat +clangbuild-darwin-fat: clean + clang -v + CXX=clang++ CC=clang CFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation -arch arm64" $(MAKE) zstd-release + mv programs/zstd programs/zstd_arm64 + CXX=clang++ CC=clang CFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation -arch x86_64" $(MAKE) zstd-release + mv programs/zstd programs/zstd_x64 + lipo -create programs/zstd_x64 programs/zstd_arm64 -output programs/zstd + .PHONY: gcc5build gcc6build gcc7build clangbuild m32build armbuild aarch64build ppcbuild ppc64build gcc5build: clean gcc-5 -v @@ -308,7 +317,7 @@ update_regressionResults: # run UBsan with -fsanitize-recover=pointer-overflow # this only works with recent compilers such as gcc 8+ usan: clean - $(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=pointer-overflow -fsanitize=undefined -Werror 
$(MOREFLAGS)" + $(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=undefined -Werror $(MOREFLAGS)" asan: clean $(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=address -Werror $(MOREFLAGS)" @@ -319,17 +328,18 @@ asan-%: clean msan: clean $(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=memory -fno-omit-frame-pointer -Werror $(MOREFLAGS)" HAVE_LZMA=0 # datagen.c fails this test for no obvious reason -msan-%: clean - LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=memory -fno-omit-frame-pointer -Werror $(MOREFLAGS)" FUZZER_FLAGS="--no-big-tests $(FUZZER_FLAGS)" $(MAKE) -C $(TESTDIR) HAVE_LZMA=0 $* +msan-%: + $(MAKE) clean + LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=memory -fno-omit-frame-pointer -Werror $(MOREFLAGS)" FUZZER_FLAGS="--no-big-tests $(FUZZER_FLAGS)" $(MAKE) -j -C $(TESTDIR) HAVE_LZMA=0 $* asan32: clean $(MAKE) -C $(TESTDIR) test32 CC=clang MOREFLAGS="-g -fsanitize=address $(MOREFLAGS)" uasan: clean - $(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=pointer-overflow -fsanitize=address,undefined -Werror $(MOREFLAGS)" + $(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=address,undefined -Werror $(MOREFLAGS)" uasan-%: clean - LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=pointer-overflow -fsanitize=address,undefined -Werror $(MOREFLAGS)" $(MAKE) -C $(TESTDIR) $* + LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=address,undefined -Werror $(MOREFLAGS)" $(MAKE) -C $(TESTDIR) $* tsan-%: clean LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=thread -Werror $(MOREFLAGS)" $(MAKE) -C $(TESTDIR) $* FUZZER_FLAGS="--no-big-tests $(FUZZER_FLAGS)" @@ -380,28 +390,32 @@ lz4install: endif -CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON 
-DCMAKE_BUILD_TYPE=Release - ifneq (,$(filter MSYS%,$(shell uname))) HOST_OS = MSYS -CMAKE_PARAMS = -G"MSYS Makefiles" -DCMAKE_BUILD_TYPE=Debug -DZSTD_MULTITHREAD_SUPPORT:BOOL=OFF -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON endif #------------------------------------------------------------------------ # target specific tests #------------------------------------------------------------------------ ifneq (,$(filter $(HOST_OS),MSYS POSIX)) -.PHONY: cmakebuild c89build gnu90build c99build gnu99build c11build bmix64build bmix32build bmi32build staticAnalyze -cmakebuild: - cmake --version - $(RM) -r $(BUILDIR)/cmake/build - $(MKDIR) $(BUILDIR)/cmake/build - cd $(BUILDIR)/cmake/build; cmake -DCMAKE_INSTALL_PREFIX:PATH=~/install_test_dir $(CMAKE_PARAMS) .. - $(MAKE) -C $(BUILDIR)/cmake/build -j4; - $(MAKE) -C $(BUILDIR)/cmake/build install; - $(MAKE) -C $(BUILDIR)/cmake/build uninstall; - cd $(BUILDIR)/cmake/build; ctest -V -L Medium +CMAKE ?= cmake +CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON + +ifneq (,$(filter MSYS%,$(shell uname))) +CMAKE_PARAMS = -G"MSYS Makefiles" -DZSTD_MULTITHREAD_SUPPORT:BOOL=OFF -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON +endif + +.PHONY: cmakebuild +cmakebuild: + $(CMAKE) --version + $(RM) -r cmakebuild install + $(MKDIR) cmakebuild install + cd cmakebuild; $(CMAKE) -Wdev -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS="-Werror -O0" -DCMAKE_INSTALL_PREFIX=install $(CMAKE_PARAMS) ../build/cmake + $(CMAKE) --build cmakebuild --target install -- -j V=1 + cd cmakebuild; ctest -V -L Medium + +.PHONY: c89build gnu90build c99build gnu99build c11build bmix64build bmix32build bmi32build staticAnalyze c89build: clean $(CC) -v CFLAGS="-std=c89 -Werror -Wno-attributes -Wpedantic -Wno-long-long -Wno-variadic-macros -O0" $(MAKE) lib zstd diff --git a/third-party/zstd/README.md b/third-party/zstd/README.md index f91e68fd..0f7478e1 
100644 --- a/third-party/zstd/README.md +++ b/third-party/zstd/README.md @@ -5,7 +5,7 @@ targeting real-time compression scenarios at zlib-level and better compression r It's backed by a very fast entropy stage, provided by [Huff0 and FSE library](https://github.com/Cyan4973/FiniteStateEntropy). Zstandard's format is stable and documented in [RFC8878](https://datatracker.ietf.org/doc/html/rfc8878). Multiple independent implementations are already available. -This repository represents the reference implementation, provided as an open-source dual [BSD](LICENSE) and [GPLv2](COPYING) licensed **C** library, +This repository represents the reference implementation, provided as an open-source dual [BSD](LICENSE) OR [GPLv2](COPYING) licensed **C** library, and a command line utility producing and decoding `.zst`, `.gz`, `.xz` and `.lz4` files. Should your project require another programming language, a list of known ports and bindings is provided on [Zstandard homepage](https://facebook.github.io/zstd/#other-languages). @@ -198,6 +198,10 @@ Going into `build` directory, you will find additional possibilities: You can build the zstd binary via buck by executing: `buck build programs:zstd` from the root of the repo. The output binary will be in `buck-out/gen/programs/`. +### Bazel + +You easily can integrate zstd into your Bazel project by using the module hosted on the [Bazel Central Repository](https://registry.bazel.build/modules/zstd). + ## Testing You can run quick local smoke tests by running `make check`. @@ -213,7 +217,7 @@ Zstandard is considered safe for production environments. ## License -Zstandard is dual-licensed under [BSD](LICENSE) and [GPLv2](COPYING). +Zstandard is dual-licensed under [BSD](LICENSE) OR [GPLv2](COPYING). 
## Contributing diff --git a/third-party/zstd/SECURITY.md b/third-party/zstd/SECURITY.md new file mode 100644 index 00000000..a5f9a7e1 --- /dev/null +++ b/third-party/zstd/SECURITY.md @@ -0,0 +1,15 @@ +# Reporting and Fixing Security Issues + +Please do not open GitHub issues or pull requests - this makes the problem immediately visible to everyone, including malicious actors. Security issues in this open source project can be safely reported via the Meta Bug Bounty program: + +https://www.facebook.com/whitehat + +Meta's security team will triage your report and determine whether or not is it eligible for a bounty under our program. + +# Receiving Vulnerability Notifications + +In the case that a significant security vulnerability is reported to us or discovered by us---without being publicly known---we will, at our discretion, notify high-profile, high-exposure users of Zstandard ahead of our public disclosure of the issue and associated fix. + +If you believe your project would benefit from inclusion in this list, please reach out to one of the maintainers. 
+ + diff --git a/third-party/zstd/appveyor.yml b/third-party/zstd/appveyor.yml deleted file mode 100644 index c58ef91a..00000000 --- a/third-party/zstd/appveyor.yml +++ /dev/null @@ -1,205 +0,0 @@ -# Following tests are run _only_ on `release` branch -# and on selected feature branch named `appveyorTest` or `visual*` - -- - version: 1.0.{build} - branches: - only: - - release - - master - - /appveyor*/ - - /visual*/ - environment: - matrix: - - COMPILER: "gcc" - HOST: "mingw" - PLATFORM: "x64" - SCRIPT: "make allzstd MOREFLAGS=-static" - ARTIFACT: "true" - BUILD: "true" - - COMPILER: "gcc" - HOST: "mingw" - PLATFORM: "x86" - SCRIPT: "make allzstd MOREFLAGS=-static" - ARTIFACT: "true" - BUILD: "true" - - - COMPILER: "clang-cl" - HOST: "cmake-visual" - PLATFORM: "x64" - CONFIGURATION: "Release" - CMAKE_GENERATOR: "Visual Studio 15 2017" - CMAKE_GENERATOR_PLATFORM: "x64" - CMAKE_GENERATOR_TOOLSET: "LLVM" - APPVEYOR_BUILD_WORKER_IMAGE: "Visual Studio 2017" - - install: - - ECHO Installing %COMPILER% %PLATFORM% %CONFIGURATION% - - SET PATH_ORIGINAL=%PATH% - - if [%HOST%]==[mingw] ( - SET "PATH_MINGW32=C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin" && - SET "PATH_MINGW64=C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin" && - COPY C:\msys64\usr\bin\make.exe C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin\make.exe && - COPY C:\msys64\usr\bin\make.exe C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin\make.exe - ) - - IF [%HOST%]==[visual] IF [%PLATFORM%]==[x64] ( - SET ADDITIONALPARAM=/p:LibraryPath="C:\Program Files\Microsoft SDKs\Windows\v7.1\lib\x64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;C:\Program Files (x86)\Microsoft Visual Studio 10.0\;C:\Program Files (x86)\Microsoft Visual Studio 10.0\lib\amd64;" - ) - - build_script: - - if [%HOST%]==[mingw] ( - ( if [%PLATFORM%]==[x64] ( - SET "PATH=%PATH_MINGW64%;%PATH_ORIGINAL%" - ) else if [%PLATFORM%]==[x86] ( - SET "PATH=%PATH_MINGW32%;%PATH_ORIGINAL%" - ) 
) - ) - - if [%HOST%]==[mingw] if [%BUILD%]==[true] ( - make -v && - sh -c "%COMPILER% -v" && - ECHO Building zlib to static link && - SET "CC=%COMPILER%" && - sh -c "cd .. && git clone --depth 1 --branch v1.2.11 https://github.com/madler/zlib" && - sh -c "cd ../zlib && make -f win32/Makefile.gcc libz.a" - ECHO Building zstd && - SET "CPPFLAGS=-I../../zlib" && - SET "LDFLAGS=../../zlib/libz.a" && - sh -c "%SCRIPT%" && - ( if [%COMPILER%]==[gcc] if [%ARTIFACT%]==[true] - ECHO Creating artifacts && - ECHO %cd% && - lib\dll\example\build_package.bat && - make -C programs DEBUGFLAGS= clean zstd && - cd programs\ && 7z a -tzip -mx9 zstd-win-binary-%PLATFORM%.zip zstd.exe && - appveyor PushArtifact zstd-win-binary-%PLATFORM%.zip && - cp zstd.exe ..\bin\zstd.exe && - git clone --depth 1 --branch release https://github.com/facebook/zstd && - cd zstd && - git archive --format=tar release -o zstd-src.tar && - ..\zstd -19 zstd-src.tar && - appveyor PushArtifact zstd-src.tar.zst && - certUtil -hashfile zstd-src.tar.zst SHA256 > zstd-src.tar.zst.sha256.sig && - appveyor PushArtifact zstd-src.tar.zst.sha256.sig && - cd ..\..\bin\ && - 7z a -tzip -mx9 zstd-win-release-%PLATFORM%.zip * && - appveyor PushArtifact zstd-win-release-%PLATFORM%.zip - ) - ) - - if [%HOST%]==[cmake-visual] ( - ECHO *** && - ECHO *** Building %CMAKE_GENERATOR% ^(%CMAKE_GENERATOR_TOOLSET%^) %PLATFORM%\%CONFIGURATION% && - PUSHD build\cmake && - cmake -DBUILD_TESTING=ON . && - cmake --build . --config %CONFIGURATION% -j4 && - POPD && - ECHO *** - ) - - test_script: - - ECHO Testing %COMPILER% %PLATFORM% %CONFIGURATION% - - SET "CC=gcc" - - SET "CXX=g++" - - if [%TEST%]==[cmake] ( - mkdir build\cmake\build && - cd build\cmake\build && - SET FUZZERTEST=-T2mn && - SET ZSTREAM_TESTTIME=-T2mn && - cmake -G "Visual Studio 14 2015 Win64" .. && - cd ..\..\.. 
&& - make clean - ) - - -# The following tests are for regular pushes -# into `dev` or some feature branch -# There run less tests, for shorter feedback loop - -- - version: 1.0.{build} - environment: - matrix: - - COMPILER: "visual" - HOST: "visual" - PLATFORM: "x64" - CONFIGURATION: "Debug" - - COMPILER: "visual" - HOST: "visual" - PLATFORM: "Win32" - CONFIGURATION: "Debug" - - COMPILER: "visual" - HOST: "visual" - PLATFORM: "x64" - CONFIGURATION: "Release" - - COMPILER: "visual" - HOST: "visual" - PLATFORM: "Win32" - CONFIGURATION: "Release" - - - COMPILER: "gcc" - HOST: "cygwin" - PLATFORM: "x64" - - - COMPILER: "clang-cl" - HOST: "cmake-visual" - PLATFORM: "x64" - CONFIGURATION: "Release" - CMAKE_GENERATOR: "Visual Studio 15 2017" - CMAKE_GENERATOR_PLATFORM: "x64" - CMAKE_GENERATOR_TOOLSET: "LLVM" - APPVEYOR_BUILD_WORKER_IMAGE: "Visual Studio 2017" - - install: - - ECHO Installing %COMPILER% %PLATFORM% %CONFIGURATION% - - SET PATH_ORIGINAL=%PATH% - - if [%HOST%]==[cygwin] ( - ECHO Installing Cygwin Packages && - C:\cygwin64\setup-x86_64.exe -qnNdO -R "C:\cygwin64" -g -P ^ - gcc,^ - cmake,^ - make - ) - - IF [%HOST%]==[visual] IF [%PLATFORM%]==[x64] ( - SET ADDITIONALPARAM=/p:LibraryPath="C:\Program Files\Microsoft SDKs\Windows\v7.1\lib\x64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;C:\Program Files (x86)\Microsoft Visual Studio 10.0\;C:\Program Files (x86)\Microsoft Visual Studio 10.0\lib\amd64;" - ) - - build_script: - - ECHO Building %COMPILER% %PLATFORM% %CONFIGURATION% - - if [%HOST%]==[cygwin] ( - set CHERE_INVOKING=yes && - set CC=%COMPILER% && - C:\cygwin64\bin\bash --login -c " - set -e; - cd build/cmake; - CFLAGS='-Werror' cmake -G 'Unix Makefiles' -DCMAKE_BUILD_TYPE=Debug -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_FUZZER_FLAGS=-T20s -DZSTD_ZSTREAM_FLAGS=-T20s -DZSTD_FULLBENCH_FLAGS=-i0 .; - make VERBOSE=1 -j; - ctest -V -L Medium; - " - ) - - if [%HOST%]==[cmake-visual] ( - ECHO *** && - ECHO *** Building %CMAKE_GENERATOR% 
^(%CMAKE_GENERATOR_TOOLSET%^) %PLATFORM%\%CONFIGURATION% && - PUSHD build\cmake && - cmake -DBUILD_TESTING=ON . && - cmake --build . --config %CONFIGURATION% -j4 && - POPD && - ECHO *** - ) - - if [%HOST%]==[visual] ( - ECHO *** && - ECHO *** Building Visual Studio 2012 %PLATFORM%\%CONFIGURATION% && - ECHO *** && - msbuild "build\VS2010\zstd.sln" /m /verbosity:minimal /property:PlatformToolset=v110 /p:ForceImportBeforeCppTargets=%APPVEYOR_BUILD_FOLDER%\build\VS2010\CompileAsCpp.props /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" && - DIR build\VS2010\bin\%PLATFORM%_%CONFIGURATION%\*.exe && - msbuild "build\VS2010\zstd.sln" /m /verbosity:minimal /property:PlatformToolset=v110 /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" && - DIR build\VS2010\bin\%PLATFORM%_%CONFIGURATION%\*.exe - ) - - - test_script: - - ECHO Testing %COMPILER% %PLATFORM% %CONFIGURATION% - - SET "FUZZERTEST=-T10s" - - if [%HOST%]==[mingw] ( - set "CC=%COMPILER%" && - make clean && - make check - ) \ No newline at end of file diff --git a/third-party/zstd/build/VS2008/zstd/zstd.vcproj b/third-party/zstd/build/VS2008/zstd/zstd.vcproj index 91f2bda5..de1501d2 100644 --- a/third-party/zstd/build/VS2008/zstd/zstd.vcproj +++ b/third-party/zstd/build/VS2008/zstd/zstd.vcproj @@ -356,6 +356,10 @@ RelativePath="..\..\..\programs\dibio.c" > + + diff --git a/third-party/zstd/build/VS2010/datagen/datagen.vcxproj b/third-party/zstd/build/VS2010/datagen/datagen.vcxproj index a66358a0..aaba4788 100644 --- a/third-party/zstd/build/VS2010/datagen/datagen.vcxproj +++ b/third-party/zstd/build/VS2010/datagen/datagen.vcxproj @@ -157,6 +157,8 @@ + + diff --git a/third-party/zstd/build/VS2010/zstd/zstd.vcxproj b/third-party/zstd/build/VS2010/zstd/zstd.vcxproj index 5e1bced6..5a5237f0 100644 --- 
a/third-party/zstd/build/VS2010/zstd/zstd.vcxproj +++ b/third-party/zstd/build/VS2010/zstd/zstd.vcxproj @@ -63,6 +63,7 @@ + diff --git a/third-party/zstd/build/cmake/CMakeLists.txt b/third-party/zstd/build/cmake/CMakeLists.txt index 0bffc87d..399b818f 100644 --- a/third-party/zstd/build/cmake/CMakeLists.txt +++ b/third-party/zstd/build/cmake/CMakeLists.txt @@ -7,16 +7,14 @@ # in the COPYING file in the root directory of this source tree). # ################################################################ -cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR) +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) # As of 2018-12-26 ZSTD has been validated to build with cmake version 3.13.2 new policies. # Set and use the newest cmake policies that are validated to work set(ZSTD_MAX_VALIDATED_CMAKE_MAJOR_VERSION "3") set(ZSTD_MAX_VALIDATED_CMAKE_MINOR_VERSION "13") #Policies never changed at PATCH level -if("${CMAKE_MAJOR_VERSION}" LESS 3) - set(ZSTD_CMAKE_POLICY_VERSION "${CMAKE_VERSION}") -elseif( "${ZSTD_MAX_VALIDATED_CMAKE_MAJOR_VERSION}" EQUAL "${CMAKE_MAJOR_VERSION}" AND - "${ZSTD_MAX_VALIDATED_CMAKE_MINOR_VERSION}" GREATER "${CMAKE_MINOR_VERSION}") +if("${ZSTD_MAX_VALIDATED_CMAKE_MAJOR_VERSION}" EQUAL "${CMAKE_MAJOR_VERSION}" AND + "${ZSTD_MAX_VALIDATED_CMAKE_MINOR_VERSION}" GREATER "${CMAKE_MINOR_VERSION}") set(ZSTD_CMAKE_POLICY_VERSION "${CMAKE_VERSION}") else() set(ZSTD_CMAKE_POLICY_VERSION "${ZSTD_MAX_VALIDATED_CMAKE_MAJOR_VERSION}.${ZSTD_MAX_VALIDATED_CMAKE_MINOR_VERSION}.0") @@ -32,24 +30,13 @@ set(LIBRARY_DIR ${ZSTD_SOURCE_DIR}/lib) include(GetZstdLibraryVersion) GetZstdLibraryVersion(${LIBRARY_DIR}/zstd.h zstd_VERSION_MAJOR zstd_VERSION_MINOR zstd_VERSION_PATCH) -if( CMAKE_MAJOR_VERSION LESS 3 ) - ## Provide cmake 3+ behavior for older versions of cmake - project(zstd) - set(PROJECT_VERSION_MAJOR ${zstd_VERSION_MAJOR}) - set(PROJECT_VERSION_MINOR ${zstd_VERSION_MINOR}) - set(PROJECT_VERSION_PATCH ${zstd_VERSION_PATCH}) - set(PROJECT_VERSION 
"${zstd_VERSION_MAJOR}.${zstd_VERSION_MINOR}.${zstd_VERSION_PATCH}") - enable_language(C) # Main library is in C - enable_language(ASM) # And ASM - enable_language(CXX) # Testing contributed code also utilizes CXX -else() - project(zstd - VERSION "${zstd_VERSION_MAJOR}.${zstd_VERSION_MINOR}.${zstd_VERSION_PATCH}" - LANGUAGES C # Main library is in C - ASM # And ASM - CXX # Testing contributed code also utilizes CXX - ) -endif() +project(zstd + VERSION "${zstd_VERSION_MAJOR}.${zstd_VERSION_MINOR}.${zstd_VERSION_PATCH}" + LANGUAGES C # Main library is in C + ASM # And ASM + CXX # Testing contributed code also utilizes CXX + ) + message(STATUS "ZSTD VERSION: ${zstd_VERSION}") set(zstd_HOMEPAGE_URL "https://facebook.github.io/zstd") set(zstd_DESCRIPTION "Zstandard is a real-time compression algorithm, providing high compression ratios.") @@ -127,10 +114,26 @@ endif () #----------------------------------------------------------------------------- # External dependencies #----------------------------------------------------------------------------- +# Define a function to handle special thread settings for HP-UX +# See https://github.com/facebook/zstd/pull/3862 for details. 
+function(setup_hpux_threads) + find_package(Threads) + if (NOT Threads_FOUND) + set(CMAKE_USE_PTHREADS_INIT 1 PARENT_SCOPE) + set(CMAKE_THREAD_LIBS_INIT -lpthread PARENT_SCOPE) + set(CMAKE_HAVE_THREADS_LIBRARY 1 PARENT_SCOPE) + set(Threads_FOUND TRUE PARENT_SCOPE) + endif() +endfunction() + if (ZSTD_MULTITHREAD_SUPPORT AND UNIX) - set(THREADS_PREFER_PTHREAD_FLAG ON) - find_package(Threads REQUIRED) - if(CMAKE_USE_PTHREADS_INIT) + if (CMAKE_SYSTEM_NAME MATCHES "HP-UX") + setup_hpux_threads() + else() + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + endif() + if (CMAKE_USE_PTHREADS_INIT) set(THREADS_LIBS "${CMAKE_THREAD_LIBS_INIT}") else() message(SEND_ERROR "ZSTD currently does not support thread libraries other than pthreads") @@ -193,10 +196,6 @@ export(EXPORT zstdExports FILE "${CMAKE_CURRENT_BINARY_DIR}/zstdTargets.cmake" NAMESPACE zstd:: ) -configure_file(zstdConfig.cmake - "${CMAKE_CURRENT_BINARY_DIR}/zstdConfig.cmake" - COPYONLY - ) # A Package Config file that works from the installation directory set(ConfigPackageLocation ${CMAKE_INSTALL_LIBDIR}/cmake/zstd) @@ -205,8 +204,13 @@ install(EXPORT zstdExports NAMESPACE zstd:: DESTINATION ${ConfigPackageLocation} ) +configure_package_config_file( + zstdConfig.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/zstdConfig.cmake" + INSTALL_DESTINATION ${ConfigPackageLocation} +) install(FILES - zstdConfig.cmake + "${CMAKE_CURRENT_BINARY_DIR}/zstdConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/zstdConfigVersion.cmake" DESTINATION ${ConfigPackageLocation} ) diff --git a/third-party/zstd/build/cmake/README.md b/third-party/zstd/build/cmake/README.md index a460dd16..4c9d3a08 100644 --- a/third-party/zstd/build/cmake/README.md +++ b/third-party/zstd/build/cmake/README.md @@ -41,6 +41,38 @@ cmake -DZSTD_BUILD_TESTS=ON -DZSTD_LEGACY_SUPPORT=OFF .. 
make ``` +### how to use it with CMake FetchContent + +For all options available, you can see it on +```cmake +include(FetchContent) + +set(ZSTD_BUILD_STATIC ON) +set(ZSTD_BUILD_SHARED OFF) + +FetchContent_Declare( + zstd + URL "https://github.com/facebook/zstd/releases/download/v1.5.5/zstd-1.5.5.tar.gz" + DOWNLOAD_EXTRACT_TIMESTAMP TRUE + SOURCE_SUBDIR build/cmake +) + +FetchContent_MakeAvailable(zstd) + +target_link_libraries( + ${PROJECT_NAME} + PRIVATE + libzstd_static +) + +# On windows and macos this is needed +target_include_directories( + ${PROJECT_NAME} + PRIVATE + ${zstd_SOURCE_DIR}/lib +) +``` + ### referring [Looking for a 'cmake clean' command to clear up CMake output](https://stackoverflow.com/questions/9680420/looking-for-a-cmake-clean-command-to-clear-up-cmake-output) diff --git a/third-party/zstd/build/cmake/contrib/pzstd/CMakeLists.txt b/third-party/zstd/build/cmake/contrib/pzstd/CMakeLists.txt index f7098fa0..e1c8e067 100644 --- a/third-party/zstd/build/cmake/contrib/pzstd/CMakeLists.txt +++ b/third-party/zstd/build/cmake/contrib/pzstd/CMakeLists.txt @@ -18,6 +18,7 @@ set(PZSTD_DIR ${ZSTD_SOURCE_DIR}/contrib/pzstd) include_directories(${PROGRAMS_DIR} ${LIBRARY_DIR} ${LIBRARY_DIR}/common ${PZSTD_DIR}) add_executable(pzstd ${PROGRAMS_DIR}/util.c ${PZSTD_DIR}/main.cpp ${PZSTD_DIR}/Options.cpp ${PZSTD_DIR}/Pzstd.cpp ${PZSTD_DIR}/SkippableFrame.cpp) +target_compile_features(pzstd PRIVATE cxx_std_11) set_property(TARGET pzstd APPEND PROPERTY COMPILE_DEFINITIONS "NDEBUG") set_property(TARGET pzstd APPEND PROPERTY COMPILE_OPTIONS "-Wno-shadow") diff --git a/third-party/zstd/build/cmake/lib/CMakeLists.txt b/third-party/zstd/build/cmake/lib/CMakeLists.txt index 30349586..5d514ccb 100644 --- a/third-party/zstd/build/cmake/lib/CMakeLists.txt +++ b/third-party/zstd/build/cmake/lib/CMakeLists.txt @@ -12,45 +12,70 @@ project(libzstd C ASM) set(CMAKE_INCLUDE_CURRENT_DIR TRUE) option(ZSTD_BUILD_STATIC "BUILD STATIC LIBRARIES" ON) option(ZSTD_BUILD_SHARED "BUILD 
SHARED LIBRARIES" ON) +option(ZSTD_BUILD_COMPRESSION "BUILD COMPRESSION MODULE" ON) +option(ZSTD_BUILD_DECOMPRESSION "BUILD DECOMPRESSION MODULE" ON) +option(ZSTD_BUILD_DICTBUILDER "BUILD DICTBUILDER MODULE" ON) +option(ZSTD_BUILD_DEPRECATED "BUILD DEPRECATED MODULE" OFF) + +set(ZSTDLIB_VISIBLE "" CACHE STRING "Visibility for ZSTDLIB API") +set(ZSTDERRORLIB_VISIBLE "" CACHE STRING "Visibility for ZSTDERRORLIB_VISIBLE API") +set(ZDICTLIB_VISIBLE "" CACHE STRING "Visibility for ZDICTLIB_VISIBLE API") +set(ZSTDLIB_STATIC_API "" CACHE STRING "Visibility for ZSTDLIB_STATIC_API API") +set(ZDICTLIB_STATIC_API "" CACHE STRING "Visibility for ZDICTLIB_STATIC_API API") + +set_property(CACHE ZSTDLIB_VISIBLE PROPERTY STRINGS "" "hidden" "default" "protected" "internal") +set_property(CACHE ZSTDERRORLIB_VISIBLE PROPERTY STRINGS "" "hidden" "default" "protected" "internal") +set_property(CACHE ZDICTLIB_VISIBLE PROPERTY STRINGS "" "hidden" "default" "protected" "internal") +set_property(CACHE ZSTDLIB_STATIC_API PROPERTY STRINGS "" "hidden" "default" "protected" "internal") +set_property(CACHE ZDICTLIB_STATIC_API PROPERTY STRINGS "" "hidden" "default" "protected" "internal") if(NOT ZSTD_BUILD_SHARED AND NOT ZSTD_BUILD_STATIC) message(SEND_ERROR "You need to build at least one flavor of libzstd") endif() -# Define library directory, where sources and header files are located -include_directories(${LIBRARY_DIR} ${LIBRARY_DIR}/common) - file(GLOB CommonSources ${LIBRARY_DIR}/common/*.c) file(GLOB CompressSources ${LIBRARY_DIR}/compress/*.c) +file(GLOB DecompressSources ${LIBRARY_DIR}/decompress/*.c) if (MSVC) - file(GLOB DecompressSources ${LIBRARY_DIR}/decompress/*.c) add_compile_options(-DZSTD_DISABLE_ASM) else () - file(GLOB DecompressSources ${LIBRARY_DIR}/decompress/*.c ${LIBRARY_DIR}/decompress/*.S) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|AMD64.*|x86_64.*|X86_64.*") + set(DecompressSources ${DecompressSources} ${LIBRARY_DIR}/decompress/huf_decompress_amd64.S) + else() + 
add_compile_options(-DZSTD_DISABLE_ASM) + endif() endif () file(GLOB DictBuilderSources ${LIBRARY_DIR}/dictBuilder/*.c) +file(GLOB DeprecatedSources ${LIBRARY_DIR}/deprecated/*.c) -set(Sources - ${CommonSources} - ${CompressSources} - ${DecompressSources} - ${DictBuilderSources}) - +file(GLOB PublicHeaders ${LIBRARY_DIR}/*.h) file(GLOB CommonHeaders ${LIBRARY_DIR}/common/*.h) file(GLOB CompressHeaders ${LIBRARY_DIR}/compress/*.h) file(GLOB DecompressHeaders ${LIBRARY_DIR}/decompress/*.h) file(GLOB DictBuilderHeaders ${LIBRARY_DIR}/dictBuilder/*.h) +file(GLOB DeprecatedHeaders ${LIBRARY_DIR}/deprecated/*.h) -set(Headers - ${LIBRARY_DIR}/zstd.h - ${CommonHeaders} - ${CompressHeaders} - ${DecompressHeaders} - ${DictBuilderHeaders}) +set(Sources ${CommonSources}) +set(Headers ${PublicHeaders} ${CommonHeaders}) +if (ZSTD_BUILD_COMPRESSION) + set(Sources ${Sources} ${CompressSources}) + set(Headers ${Headers} ${CompressHeaders}) +endif() +if (ZSTD_BUILD_DECOMPRESSION) + set(Sources ${Sources} ${DecompressSources}) + set(Headers ${Headers} ${DecompressHeaders}) +endif() +if (ZSTD_BUILD_DICTBUILDER) + set(Sources ${Sources} ${DictBuilderSources}) + set(Headers ${Headers} ${DictBuilderHeaders}) +endif() +if (ZSTD_BUILD_DEPRECATED) + set(Sources ${Sources} ${DeprecatedSources}) + set(Headers ${Headers} ${DeprecatedHeaders}) +endif() if (ZSTD_LEGACY_SUPPORT) set(LIBRARY_LEGACY_DIR ${LIBRARY_DIR}/legacy) - include_directories(${LIBRARY_LEGACY_DIR}) set(Sources ${Sources} ${LIBRARY_LEGACY_DIR}/zstd_v01.c @@ -81,22 +106,38 @@ endif () # Our assembly expects to be compiled by a C compiler, and is only enabled for # __GNUC__ compatible compilers. Otherwise all the ASM code is disabled by # macros. 
-set_source_files_properties(${Sources} PROPERTIES LANGUAGE C) +if(NOT CMAKE_ASM_COMPILER STREQUAL CMAKE_C_COMPILER) + set_source_files_properties(${Sources} PROPERTIES LANGUAGE C) +endif() + +macro (add_definition target var) + if (NOT ("${${var}}" STREQUAL "")) + set_property(TARGET ${target} APPEND PROPERTY COMPILE_DEFINITIONS "${var}=__attribute__((visibility(\"${${var}}\")))") + endif () +endmacro () + +# Define directories containing the library's public headers +set(PUBLIC_INCLUDE_DIRS ${LIBRARY_DIR}) # Split project to static and shared libraries build set(library_targets) if (ZSTD_BUILD_SHARED) add_library(libzstd_shared SHARED ${Sources} ${Headers} ${PlatformDependResources}) + target_include_directories(libzstd_shared INTERFACE $) list(APPEND library_targets libzstd_shared) if (ZSTD_MULTITHREAD_SUPPORT) set_property(TARGET libzstd_shared APPEND PROPERTY COMPILE_DEFINITIONS "ZSTD_MULTITHREAD") if (UNIX) target_link_libraries(libzstd_shared ${THREADS_LIBS}) endif () - endif() + endif () + add_definition(libzstd_shared ZSTDLIB_VISIBLE) + add_definition(libzstd_shared ZSTDERRORLIB_VISIBLE) + add_definition(libzstd_shared ZDICTLIB_VISIBLE) endif () if (ZSTD_BUILD_STATIC) add_library(libzstd_static STATIC ${Sources} ${Headers}) + target_include_directories(libzstd_static INTERFACE $) list(APPEND library_targets libzstd_static) if (ZSTD_MULTITHREAD_SUPPORT) set_property(TARGET libzstd_static APPEND PROPERTY COMPILE_DEFINITIONS "ZSTD_MULTITHREAD") @@ -104,6 +145,41 @@ if (ZSTD_BUILD_STATIC) target_link_libraries(libzstd_static ${THREADS_LIBS}) endif () endif () + add_definition(libzstd_static ZSTDLIB_VISIBLE) + add_definition(libzstd_static ZSTDERRORLIB_VISIBLE) + add_definition(libzstd_static ZDICTLIB_VISIBLE) + add_definition(libzstd_static ZSTDLIB_STATIC_API) + add_definition(libzstd_static ZDICTLIB_STATIC_API) +endif () +if (ZSTD_BUILD_SHARED AND NOT ZSTD_BUILD_STATIC) + if (NOT BUILD_SHARED_LIBS) + message(WARNING "BUILD_SHARED_LIBS is OFF, but 
ZSTD_BUILD_SHARED is ON and ZSTD_BUILD_STATIC is OFF, which takes precedence, so libzstd is a shared library") + endif () + add_library(libzstd INTERFACE) + target_link_libraries(libzstd INTERFACE libzstd_shared) + list(APPEND library_targets libzstd) +endif () +if (ZSTD_BUILD_STATIC AND NOT ZSTD_BUILD_SHARED) + if (BUILD_SHARED_LIBS) + message(WARNING "BUILD_SHARED_LIBS is ON, but ZSTD_BUILD_SHARED is OFF and ZSTD_BUILD_STATIC is ON, which takes precedence, is set so libzstd is a static library") + endif () + add_library(libzstd INTERFACE) + target_link_libraries(libzstd INTERFACE libzstd_static) + list(APPEND library_targets libzstd) +endif () +if (ZSTD_BUILD_SHARED AND ZSTD_BUILD_STATIC) + # If both ZSTD_BUILD_SHARED and ZSTD_BUILD_STATIC are set, which is the + # default, fallback to using BUILD_SHARED_LIBS to determine whether to + # set libzstd to static or shared. + if (BUILD_SHARED_LIBS) + add_library(libzstd INTERFACE) + target_link_libraries(libzstd INTERFACE libzstd_shared) + list(APPEND library_targets libzstd) + else () + add_library(libzstd INTERFACE) + target_link_libraries(libzstd INTERFACE libzstd_static) + list(APPEND library_targets libzstd) + endif () endif () # Add specific compile definitions for MSVC project @@ -154,11 +230,7 @@ configure_file("${LIBRARY_DIR}/libzstd.pc.in" "${CMAKE_CURRENT_BINARY_DIR}/libzs install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libzstd.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") # install target -install(FILES - "${LIBRARY_DIR}/zstd.h" - "${LIBRARY_DIR}/zdict.h" - "${LIBRARY_DIR}/zstd_errors.h" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") +install(FILES ${PublicHeaders} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") install(TARGETS ${library_targets} EXPORT zstdExports diff --git a/third-party/zstd/build/cmake/programs/CMakeLists.txt b/third-party/zstd/build/cmake/programs/CMakeLists.txt index 58d998e4..5e239e32 100644 --- a/third-party/zstd/build/cmake/programs/CMakeLists.txt +++ 
b/third-party/zstd/build/cmake/programs/CMakeLists.txt @@ -32,7 +32,12 @@ if (MSVC) set(PlatformDependResources ${MSVC_RESOURCE_DIR}/zstd.rc) endif () -add_executable(zstd ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${PROGRAMS_DIR}/fileio.c ${PROGRAMS_DIR}/fileio_asyncio.c ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/dibio.c ${PROGRAMS_DIR}/zstdcli_trace.c ${PlatformDependResources}) +file(GLOB ZSTD_PROGRAM_SRCS "${PROGRAMS_DIR}/*.c") +if (MSVC AND ZSTD_PROGRAMS_LINK_SHARED) + list(APPEND ZSTD_PROGRAM_SRCS ${LIBRARY_DIR}/common/pool.c ${LIBRARY_DIR}/common/threading.c) +endif () + +add_executable(zstd ${ZSTD_PROGRAM_SRCS}) target_link_libraries(zstd ${PROGRAMS_ZSTD_LINK_TARGET}) if (CMAKE_SYSTEM_NAME MATCHES "(Solaris|SunOS)") target_link_libraries(zstd rt) @@ -75,7 +80,9 @@ if (UNIX) ${CMAKE_CURRENT_BINARY_DIR}/zstdless.1 DESTINATION "${MAN_INSTALL_DIR}") - add_executable(zstd-frugal ${PROGRAMS_DIR}/zstdcli.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${PROGRAMS_DIR}/fileio.c ${PROGRAMS_DIR}/fileio_asyncio.c) + add_executable(zstd-frugal ${PROGRAMS_DIR}/zstdcli.c + ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c + ${PROGRAMS_DIR}/fileio.c ${PROGRAMS_DIR}/fileio_asyncio.c) target_link_libraries(zstd-frugal ${PROGRAMS_ZSTD_LINK_TARGET}) set_property(TARGET zstd-frugal APPEND PROPERTY COMPILE_DEFINITIONS "ZSTD_NOBENCH;ZSTD_NODICT;ZSTD_NOTRACE") endif () diff --git a/third-party/zstd/build/cmake/tests/CMakeLists.txt b/third-party/zstd/build/cmake/tests/CMakeLists.txt index 250f0508..56104a4e 100644 --- a/third-party/zstd/build/cmake/tests/CMakeLists.txt +++ b/third-party/zstd/build/cmake/tests/CMakeLists.txt @@ -50,18 +50,18 @@ set(PROGRAMS_DIR ${ZSTD_SOURCE_DIR}/programs) set(TESTS_DIR ${ZSTD_SOURCE_DIR}/tests) include_directories(${TESTS_DIR} ${PROGRAMS_DIR} ${LIBRARY_DIR} ${LIBRARY_DIR}/common ${LIBRARY_DIR}/compress ${LIBRARY_DIR}/dictBuilder) -add_executable(datagen 
${PROGRAMS_DIR}/datagen.c ${TESTS_DIR}/datagencli.c) +add_executable(datagen ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/lorem.c ${TESTS_DIR}/loremOut.c ${TESTS_DIR}/datagencli.c) target_link_libraries(datagen libzstd_static) # # fullbench # -add_executable(fullbench ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${TESTS_DIR}/fullbench.c) +add_executable(fullbench ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/lorem.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${TESTS_DIR}/fullbench.c) if (NOT MSVC) target_compile_options(fullbench PRIVATE "-Wno-deprecated-declarations") endif() target_link_libraries(fullbench libzstd_static) -add_test(NAME fullbench COMMAND fullbench ${ZSTD_FULLBENCH_FLAGS}) +add_test(NAME fullbench COMMAND "$" ${ZSTD_FULLBENCH_FLAGS}) # # fuzzer @@ -73,7 +73,7 @@ endif() target_link_libraries(fuzzer libzstd_static) AddTestFlagsOption(ZSTD_FUZZER_FLAGS "$ENV{FUZZERTEST} $ENV{FUZZER_FLAGS}" "Semicolon-separated list of flags to pass to the fuzzer test (see `fuzzer -h` for usage)") -add_test(NAME fuzzer COMMAND fuzzer ${ZSTD_FUZZER_FLAGS}) +add_test(NAME fuzzer COMMAND "$" ${ZSTD_FUZZER_FLAGS}) # Disable the timeout since the run time is too long for the default timeout of # 1500 seconds and varies considerably between low-end and high-end CPUs. 
# set_tests_properties(fuzzer PROPERTIES TIMEOUT 0) @@ -88,7 +88,7 @@ endif() target_link_libraries(zstreamtest libzstd_static) AddTestFlagsOption(ZSTD_ZSTREAM_FLAGS "$ENV{ZSTREAM_TESTTIME} $ENV{FUZZER_FLAGS}" "Semicolon-separated list of flags to pass to the zstreamtest test (see `zstreamtest -h` for usage)") -add_test(NAME zstreamtest COMMAND zstreamtest ${ZSTD_ZSTREAM_FLAGS}) +add_test(NAME zstreamtest COMMAND "$" ${ZSTD_ZSTREAM_FLAGS}) # # playTests.sh @@ -110,7 +110,7 @@ endif() # Label the "Medium" set of tests (see TESTING.md) set_property(TEST fuzzer zstreamtest playTests APPEND PROPERTY LABELS Medium) -add_executable(paramgrill ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/paramgrill.c) +add_executable(paramgrill ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/lorem.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/paramgrill.c) if (UNIX) target_link_libraries(paramgrill libzstd_static m) #m is math library else() diff --git a/third-party/zstd/build/cmake/zstdConfig.cmake b/third-party/zstd/build/cmake/zstdConfig.cmake deleted file mode 100644 index ebbfcc38..00000000 --- a/third-party/zstd/build/cmake/zstdConfig.cmake +++ /dev/null @@ -1 +0,0 @@ -include("${CMAKE_CURRENT_LIST_DIR}/zstdTargets.cmake") diff --git a/third-party/zstd/build/cmake/zstdConfig.cmake.in b/third-party/zstd/build/cmake/zstdConfig.cmake.in new file mode 100644 index 00000000..f4190f98 --- /dev/null +++ b/third-party/zstd/build/cmake/zstdConfig.cmake.in @@ -0,0 +1,10 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +if(@ZSTD_MULTITHREAD_SUPPORT@ AND "@UNIX@") + find_dependency(Threads) +endif() + +include("${CMAKE_CURRENT_LIST_DIR}/zstdTargets.cmake") + +check_required_components("zstd") diff --git a/third-party/zstd/build/meson/programs/meson.build b/third-party/zstd/build/meson/programs/meson.build index 0b5a9305..e103a629 
100644 --- a/third-party/zstd/build/meson/programs/meson.build +++ b/third-party/zstd/build/meson/programs/meson.build @@ -18,6 +18,7 @@ zstd_programs_sources = [join_paths(zstd_rootdir, 'programs/zstdcli.c'), join_paths(zstd_rootdir, 'programs/benchfn.c'), join_paths(zstd_rootdir, 'programs/benchzstd.c'), join_paths(zstd_rootdir, 'programs/datagen.c'), + join_paths(zstd_rootdir, 'programs/lorem.c'), join_paths(zstd_rootdir, 'programs/dibio.c'), join_paths(zstd_rootdir, 'programs/zstdcli_trace.c')] diff --git a/third-party/zstd/build/meson/tests/meson.build b/third-party/zstd/build/meson/tests/meson.build index 2dd8d106..9847ab03 100644 --- a/third-party/zstd/build/meson/tests/meson.build +++ b/third-party/zstd/build/meson/tests/meson.build @@ -29,6 +29,7 @@ DECODECORPUS_TESTTIME = '-T30' test_includes = [ include_directories(join_paths(zstd_rootdir, 'programs')) ] testcommon_sources = [join_paths(zstd_rootdir, 'programs/datagen.c'), + join_paths(zstd_rootdir, 'programs/lorem.c'), join_paths(zstd_rootdir, 'programs/util.c'), join_paths(zstd_rootdir, 'programs/timefn.c'), join_paths(zstd_rootdir, 'programs/benchfn.c'), @@ -43,7 +44,8 @@ testcommon_dep = declare_dependency(link_with: testcommon, dependencies: libzstd_deps, include_directories: libzstd_includes) -datagen_sources = [join_paths(zstd_rootdir, 'tests/datagencli.c')] +datagen_sources = [join_paths(zstd_rootdir, 'tests/datagencli.c'), + join_paths(zstd_rootdir, 'tests/loremOut.c')] datagen = executable('datagen', datagen_sources, c_args: [ '-DNDEBUG' ], diff --git a/third-party/zstd/contrib/linux-kernel/mem.h b/third-party/zstd/contrib/linux-kernel/mem.h index a7231822..2e91e778 100644 --- a/third-party/zstd/contrib/linux-kernel/mem.h +++ b/third-party/zstd/contrib/linux-kernel/mem.h @@ -24,6 +24,7 @@ /*-**************************************** * Compiler specifics ******************************************/ +#undef MEM_STATIC /* may be already defined from common/compiler.h */ #define MEM_STATIC static 
inline /*-************************************************************** diff --git a/third-party/zstd/contrib/linux-kernel/zstd_decompress_module.c b/third-party/zstd/contrib/linux-kernel/zstd_decompress_module.c index eb1c49e6..7d31518e 100644 --- a/third-party/zstd/contrib/linux-kernel/zstd_decompress_module.c +++ b/third-party/zstd/contrib/linux-kernel/zstd_decompress_module.c @@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); size_t zstd_reset_dstream(zstd_dstream *dstream) { - return ZSTD_resetDStream(dstream); + return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); } EXPORT_SYMBOL(zstd_reset_dstream); diff --git a/third-party/zstd/contrib/linux-kernel/zstd_deps.h b/third-party/zstd/contrib/linux-kernel/zstd_deps.h index 670c5fa2..f931f7d0 100644 --- a/third-party/zstd/contrib/linux-kernel/zstd_deps.h +++ b/third-party/zstd/contrib/linux-kernel/zstd_deps.h @@ -115,11 +115,7 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { #ifndef ZSTD_DEPS_STDINT #define ZSTD_DEPS_STDINT -/* - * The Linux Kernel doesn't provide intptr_t, only uintptr_t, which - * is an unsigned long. - */ -typedef long intptr_t; +/* intptr_t already provided by ZSTD_DEPS_COMMON */ #endif /* ZSTD_DEPS_STDINT */ #endif /* ZSTD_DEPS_NEED_STDINT */ diff --git a/third-party/zstd/contrib/pzstd/Makefile b/third-party/zstd/contrib/pzstd/Makefile index e62f8e87..e4b3e8a2 100644 --- a/third-party/zstd/contrib/pzstd/Makefile +++ b/third-party/zstd/contrib/pzstd/Makefile @@ -10,7 +10,7 @@ # Standard variables for installation DESTDIR ?= PREFIX ?= /usr/local -BINDIR := $(DESTDIR)$(PREFIX)/bin +BINDIR := $(PREFIX)/bin ZSTDDIR = ../../lib PROGDIR = ../../programs @@ -37,11 +37,8 @@ CFLAGS += -Wno-deprecated-declarations PZSTD_INC = -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(PROGDIR) -I. GTEST_INC = -isystem googletest/googletest/include -# If default C++ version is older than C++11, explicitly set C++11, which is the -# minimum required by the code. 
-ifeq ($(shell echo "\043if __cplusplus < 201103L\n\043error\n\043endif" | $(CXX) -x c++ -Werror -c - -o /dev/null 2>/dev/null && echo 1 || echo 0),0) -PZSTD_CXX_STD := -std=c++11 -endif +# Set the minimum required by gtest +PZSTD_CXX_STD := -std=c++14 PZSTD_CPPFLAGS = $(PZSTD_INC) PZSTD_CCXXFLAGS = @@ -112,12 +109,12 @@ check: .PHONY: install install: PZSTD_CPPFLAGS += -DNDEBUG install: pzstd$(EXT) - install -d -m 755 $(BINDIR)/ - install -m 755 pzstd$(EXT) $(BINDIR)/pzstd$(EXT) + install -d -m 755 $(DESTDIR)$(BINDIR)/ + install -m 755 pzstd$(EXT) $(DESTDIR)$(BINDIR)/pzstd$(EXT) .PHONY: uninstall uninstall: - $(RM) $(BINDIR)/pzstd$(EXT) + $(RM) $(DESTDIR)$(BINDIR)/pzstd$(EXT) # Targets for many different builds .PHONY: all diff --git a/third-party/zstd/contrib/seekable_format/examples/parallel_processing.c b/third-party/zstd/contrib/seekable_format/examples/parallel_processing.c index 356561e5..92837102 100644 --- a/third-party/zstd/contrib/seekable_format/examples/parallel_processing.c +++ b/third-party/zstd/contrib/seekable_format/examples/parallel_processing.c @@ -19,7 +19,7 @@ #define ZSTD_STATIC_LINKING_ONLY #include // presumes zstd library is installed #include -#if defined(WIN32) || defined(_WIN32) +#if defined(_WIN32) # include # define SLEEP(x) Sleep(x) #else diff --git a/third-party/zstd/doc/decompressor_errata.md b/third-party/zstd/doc/decompressor_errata.md index b162e7fd..b570f731 100644 --- a/third-party/zstd/doc/decompressor_errata.md +++ b/third-party/zstd/doc/decompressor_errata.md @@ -6,12 +6,53 @@ Each entry will contain: 1. The last affected decompressor versions. 2. The decompressor components affected. 2. Whether the compressed frame could ever be produced by the reference compressor. -3. An example frame. +3. An example frame (hexadecimal string when it can be short enough, link to golden file otherwise) 4. A description of the bug. 
The document is in reverse chronological order, with the bugs that affect the most recent zstd decompressor versions listed first. +No sequence using the 2-bytes format +------------------------------------------------ + +**Last affected version**: v1.5.5 + +**Affected decompressor component(s)**: Library & CLI + +**Produced by the reference compressor**: No + +**Example Frame**: see zstd/tests/golden-decompression/zeroSeq_2B.zst + +The zstd decoder incorrectly expects FSE tables when there are 0 sequences present in the block +if the value 0 is encoded using the 2-bytes format. +Instead, it should immediately end the sequence section, and move on to next block. + +This situation was never generated by the reference compressor, +because representing 0 sequences with the 2-bytes format is inefficient +(the 1-byte format is always used in this case). + + +Compressed block with a size of exactly 128 KB +------------------------------------------------ + +**Last affected version**: v1.5.2 + +**Affected decompressor component(s)**: Library & CLI + +**Produced by the reference compressor**: No + +**Example Frame**: see zstd/tests/golden-decompression/block-128k.zst + +The zstd decoder incorrectly rejected blocks of type `Compressed_Block` when their size was exactly 128 KB. +Note that `128 KB - 1` was accepted, and `128 KB + 1` is forbidden by the spec. + +This type of block was never generated by the reference compressor. + +These blocks used to be disallowed by the spec up until spec version 0.3.2 when the restriction was lifted by [PR#1689](https://github.com/facebook/zstd/pull/1689). + +> A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block). 
+ + Compressed block with 0 literals and 0 sequences ------------------------------------------------ @@ -31,6 +72,7 @@ Additionally, these blocks were disallowed by the spec up until spec version 0.3 > A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block). + First block is RLE block ------------------------ @@ -52,6 +94,7 @@ block. https://github.com/facebook/zstd/blob/8814aa5bfa74f05a86e55e9d508da177a893ceeb/lib/compress/zstd_compress.c#L3527-L3535 + Tiny FSE Table & Block ---------------------- @@ -82,3 +125,24 @@ The total `Block_Content` is `5` bytes, and `Last_Table_Offset` is `2`. See the compressor workaround code: https://github.com/facebook/zstd/blob/8814aa5bfa74f05a86e55e9d508da177a893ceeb/lib/compress/zstd_compress.c#L2667-L2682 + +Magicless format +---------------------- + +**Last affected version**: v1.5.5 + +**Affected decompressor component(s)**: Library + +**Produced by the reference compressor**: Yes (example: https://gist.github.com/embg/9940726094f4cf2cef162cffe9319232) + +**Example Frame**: `27 b5 2f fd 00 03 19 00 00 66 6f 6f 3f ba c4 59` + +v1.5.6 fixes several bugs in which the magicless-format decoder rejects valid frames. +These include but are not limited to: +* Valid frames that happen to begin with a legacy magic number (little-endian) +* Valid frames that happen to begin with a skippable magic number (little-endian) + +If you are affected by this issue and cannot update to v1.5.6 or later, there is a +workaround to recover affected data. Simply prepend the ZSTD magic number +`0xFD2FB528` (little-endian) to your data and decompress using the standard-format +decoder. 
diff --git a/third-party/zstd/doc/decompressor_permissive.md b/third-party/zstd/doc/decompressor_permissive.md new file mode 100644 index 00000000..bd77165f --- /dev/null +++ b/third-party/zstd/doc/decompressor_permissive.md @@ -0,0 +1,60 @@ +Decompressor Permissiveness to Invalid Data +=========================================== + +This document describes the behavior of the reference decompressor in cases +where it accepts formally invalid data instead of reporting an error. + +While the reference decompressor *must* decode any compliant frame following +the specification, its ability to detect erroneous data is on a best effort +basis: the decoder may accept input data that would be formally invalid, +when it causes no risk to the decoder, and which detection would cost too much +complexity or speed regression. + +In practice, the vast majority of invalid data are detected, if only because +many corruption events are dangerous for the decoder process (such as +requesting an out-of-bound memory access) and many more are easy to check. + +This document lists a few known cases where invalid data was formerly accepted +by the decoder, and what has changed since. + + +Offset == 0 +----------- + +**Last affected version**: v1.5.5 + +**Produced by the reference compressor**: No + +**Example Frame**: `28b5 2ffd 0000 4500 0008 0002 002f 430b ae` + +If a sequence is decoded with `literals_length = 0` and `offset_value = 3` +while `Repeated_Offset_1 = 1`, the computed offset will be `0`, which is +invalid. + +The reference decompressor up to v1.5.5 processes this case as if the computed +offset was `1`, including inserting `1` into the repeated offset list. +This prevents the output buffer from remaining uninitialized, thus denying a +potential attack vector from an untrusted source. +However, in the rare case where this scenario would be the outcome of a +transmission or storage error, the decoder relies on the checksum to detect +the error. 
+ +In newer versions, this case is always detected and reported as a corruption error. + + +Non-zeroes reserved bits +------------------------ + +**Last affected version**: v1.5.5 + +**Produced by the reference compressor**: No + +The Sequences section of each block has a header, and one of its elements is a +byte, which describes the compression mode of each symbol. +This byte contains 2 reserved bits which must be set to zero. + +The reference decompressor up to v1.5.5 just ignores these 2 bits. +This behavior has no consequence for the rest of the frame decoding process. + +In newer versions, the 2 reserved bits are actively checked for value zero, +and the decoder reports a corruption error if they are not. diff --git a/third-party/zstd/doc/educational_decoder/zstd_decompress.c b/third-party/zstd/doc/educational_decoder/zstd_decompress.c index 9ade7650..839e085b 100644 --- a/third-party/zstd/doc/educational_decoder/zstd_decompress.c +++ b/third-party/zstd/doc/educational_decoder/zstd_decompress.c @@ -997,7 +997,8 @@ static void decompress_sequences(frame_context_t *const ctx, const size_t num_sequences); static sequence_command_t decode_sequence(sequence_states_t *const state, const u8 *const src, - i64 *const offset); + i64 *const offset, + int lastSequence); static void decode_seq_table(FSE_dtable *const table, istream_t *const in, const seq_part_t type, const seq_mode_t mode); @@ -1017,12 +1018,7 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in, // This is a variable size field using between 1 and 3 bytes. Let's call its // first byte byte0." u8 header = IO_read_bits(in, 8); - if (header == 0) { - // "There are no sequences. The sequence section stops there. - // Regenerated content is defined entirely by literals section." - *sequences = NULL; - return 0; - } else if (header < 128) { + if (header < 128) { // "Number_of_Sequences = byte0 . Uses 1 byte." 
num_sequences = header; } else if (header < 255) { @@ -1033,6 +1029,12 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in, num_sequences = IO_read_bits(in, 16) + 0x7F00; } + if (num_sequences == 0) { + // "There are no sequences. The sequence section stops there." + *sequences = NULL; + return 0; + } + *sequences = malloc(num_sequences * sizeof(sequence_command_t)); if (!*sequences) { BAD_ALLOC(); @@ -1114,7 +1116,7 @@ static void decompress_sequences(frame_context_t *const ctx, istream_t *in, for (size_t i = 0; i < num_sequences; i++) { // Decode sequences one by one - sequences[i] = decode_sequence(&states, src, &bit_offset); + sequences[i] = decode_sequence(&states, src, &bit_offset, i==num_sequences-1); } if (bit_offset != 0) { @@ -1125,7 +1127,8 @@ static void decompress_sequences(frame_context_t *const ctx, istream_t *in, // Decode a single sequence and update the state static sequence_command_t decode_sequence(sequence_states_t *const states, const u8 *const src, - i64 *const offset) { + i64 *const offset, + int lastSequence) { // "Each symbol is a code in its own context, which specifies Baseline and // Number_of_Bits to add. Codes are FSE compressed, and interleaved with raw // additional bits in the same bitstream." @@ -1160,7 +1163,7 @@ static sequence_command_t decode_sequence(sequence_states_t *const states, // Literals_Length_State is updated, followed by Match_Length_State, and // then Offset_State." 
// If the stream is complete don't read bits to update state - if (*offset != 0) { + if (!lastSequence) { FSE_update_state(&states->ll_table, &states->ll_state, src, offset); FSE_update_state(&states->ml_table, &states->ml_state, src, offset); FSE_update_state(&states->of_table, &states->of_state, src, offset); @@ -1210,7 +1213,7 @@ static void decode_seq_table(FSE_dtable *const table, istream_t *const in, break; } case seq_repeat: - // "Repeat_Mode : re-use distribution table from previous compressed + // "Repeat_Mode : reuse distribution table from previous compressed // block." // Nothing to do here, table will be unchanged if (!table->symbols) { @@ -1399,7 +1402,7 @@ size_t ZSTD_get_decompressed_size(const void *src, const size_t src_len) { /******* END OUTPUT SIZE COUNTING *********************************************/ /******* DICTIONARY PARSING ***************************************************/ -dictionary_t* create_dictionary() { +dictionary_t* create_dictionary(void) { dictionary_t* const dict = calloc(1, sizeof(dictionary_t)); if (!dict) { BAD_ALLOC(); diff --git a/third-party/zstd/doc/zstd_compression_format.md b/third-party/zstd/doc/zstd_compression_format.md index 3843bf39..7955dae4 100644 --- a/third-party/zstd/doc/zstd_compression_format.md +++ b/third-party/zstd/doc/zstd_compression_format.md @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -0.3.9 (2023-03-08) +0.4.0 (2023-06-05) Introduction @@ -650,13 +650,15 @@ __`Number_of_Sequences`__ This is a variable size field using between 1 and 3 bytes. Let's call its first byte `byte0`. -- `if (byte0 == 0)` : there are no sequences. - The sequence section stops there. - Decompressed content is defined entirely as Literals Section content. - The FSE tables used in `Repeat_Mode` aren't updated. - `if (byte0 < 128)` : `Number_of_Sequences = byte0` . Uses 1 byte. -- `if (byte0 < 255)` : `Number_of_Sequences = ((byte0-128) << 8) + byte1` . Uses 2 bytes. 
-- `if (byte0 == 255)`: `Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00` . Uses 3 bytes. +- `if (byte0 < 255)` : `Number_of_Sequences = ((byte0 - 0x80) << 8) + byte1`. Uses 2 bytes. + Note that the 2 bytes format fully overlaps the 1 byte format. +- `if (byte0 == 255)`: `Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00`. Uses 3 bytes. + +`if (Number_of_Sequences == 0)` : there are no sequences. + The sequence section stops immediately, + FSE tables used in `Repeat_Mode` aren't updated. + Block's decompressed content is defined solely by the Literals Section content. __Symbol compression modes__ @@ -927,7 +929,10 @@ There is an exception though, when current sequence's `literals_length = 0`. In this case, repeated offsets are shifted by one, so an `offset_value` of 1 means `Repeated_Offset2`, an `offset_value` of 2 means `Repeated_Offset3`, -and an `offset_value` of 3 means `Repeated_Offset1 - 1_byte`. +and an `offset_value` of 3 means `Repeated_Offset1 - 1`. + +In the final case, if `Repeated_Offset1 - 1` evaluates to 0, then the +data is considered corrupted. For the first block, the starting offset history is populated with following values : `Repeated_Offset1`=1, `Repeated_Offset2`=4, `Repeated_Offset3`=8, @@ -1081,7 +1086,8 @@ It depends on : Presuming an `Accuracy_Log` of 8, and presuming 100 probabilities points have already been distributed, the decoder may read any value from `0` to `256 - 100 + 1 == 157` (inclusive). - Therefore, it must read `log2sup(157) == 8` bits. + Therefore, it may read up to `log2sup(157) == 8` bits, where `log2sup(N)` + is the smallest integer `T` that satisfies `(1 << T) > N`. - Value decoded : small values use 1 less bit : __example__ : @@ -1121,6 +1127,9 @@ When last symbol reaches cumulated total of `1 << Accuracy_Log`, decoding is complete. If the last symbol makes cumulated total go above `1 << Accuracy_Log`, distribution is considered corrupted. 
+If this process results in a non-zero probability for a value outside of the +valid range of values that the FSE table is defined for, even if that value is +not used, then the data is considered corrupted. Then the decoder can tell how many bytes were used in this process, and how many symbols are present. @@ -1184,9 +1193,9 @@ Baseline is assigned starting from the higher states using fewer bits, increasing at each state, then resuming at the first state, each state takes its allocated width from Baseline. -| state value | 1 | 39 | 77 | 84 | 122 | | state order | 0 | 1 | 2 | 3 | 4 | | ---------------- | ----- | ----- | ------ | ---- | ------ | +| state value | 1 | 39 | 77 | 84 | 122 | | width | 32 | 32 | 32 | 16 | 16 | | `Number_of_Bits` | 5 | 5 | 5 | 4 | 4 | | range number | 2 | 4 | 6 | 0 | 1 | @@ -1249,7 +1258,9 @@ Number_of_Bits = Weight ? (Max_Number_of_Bits + 1 - Weight) : 0 ``` When a literal value is not present, it receives a `Weight` of 0. The least frequent symbol receives a `Weight` of 1. -Consequently, the `Weight` 1 is necessarily present. +If no literal has a `Weight` of 1, then the data is considered corrupted. +If there are not at least two literals with non-zero `Weight`, then the data +is considered corrupted. The most frequent symbol receives a `Weight` anywhere between 1 and 11 (max). The last symbol's `Weight` is deduced from previously retrieved Weights, by completing to the nearest power of 2. It's necessarily non 0. @@ -1350,6 +1361,9 @@ If updating state after decoding a symbol would require more bits than remain in the stream, it is assumed that extra bits are 0. Then, symbols for each of the final states are decoded and the process is complete. +If this process would produce more weights than the maximum number of decoded +weights (255), then the data is considered corrupted. + #### Conversion from weights to Huffman prefix codes All present symbols shall now have a `Weight` value. 
@@ -1697,6 +1711,7 @@ or at least provide a meaningful error code explaining for which reason it canno Version changes --------------- +- 0.4.0 : fixed imprecise behavior for nbSeq==0, detected by Igor Pavlov - 0.3.9 : clarifications for Huffman-compressed literal sizes. - 0.3.8 : clarifications for Huffman Blocks and Huffman Tree descriptions. - 0.3.7 : clarifications for Repeat_Offsets, matching RFC8878 diff --git a/third-party/zstd/doc/zstd_manual.html b/third-party/zstd/doc/zstd_manual.html index dcc10208..bc4a2403 100644 --- a/third-party/zstd/doc/zstd_manual.html +++ b/third-party/zstd/doc/zstd_manual.html @@ -1,10 +1,10 @@ -zstd 1.5.5 Manual +zstd 1.5.6 Manual -

zstd 1.5.5 Manual

+

zstd 1.5.6 Manual


Contents

    @@ -156,7 +156,7 @@

    Helper functions

    /* ZSTD_compressBound() :
      * for example to size a static array on stack.
      * Will produce constant value 0 if srcSize too large.
      */
    -#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
    +#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U)
     #define ZSTD_COMPRESSBOUND(srcSize)   (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
     size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
     /* ZSTD_isError() :
    @@ -174,7 +174,7 @@ 

    Helper functions

    /* ZSTD_compressBound() :
     
     

    Compression context

      When compressing many times,
       it is recommended to allocate a context just once,
    -  and re-use it for each successive compression operation.
    +  and reuse it for each successive compression operation.
       This will make workload friendlier for system's memory.
       Note : re-using context is just a speed / resource optimization.
              It doesn't change the compression ratio, which remains identical.
    @@ -190,9 +190,9 @@ 

    Compression context

      When compressing many times,
                        const void* src, size_t srcSize,
                              int compressionLevel);
     

    Same as ZSTD_compress(), using an explicit ZSTD_CCtx. - Important : in order to behave similarly to `ZSTD_compress()`, - this function compresses at requested compression level, - __ignoring any other parameter__ . + Important : in order to mirror `ZSTD_compress()` behavior, + this function compresses at the requested compression level, + __ignoring any other advanced parameter__ . If any advanced parameter was set using the advanced API, they will all be reset. Only `compressionLevel` remains. @@ -200,7 +200,7 @@

    Compression context

      When compressing many times,
     
     

    Decompression context

      When decompressing many times,
       it is recommended to allocate a context only once,
    -  and re-use it for each successive compression operation.
    +  and reuse it for each successive compression operation.
       This will make workload friendlier for system's memory.
       Use one context per thread for parallel execution. 
     
    typedef struct ZSTD_DCtx_s ZSTD_DCtx;
    @@ -212,7 +212,7 @@ 

    Decompression context

      When decompressing many times,
                          const void* src, size_t srcSize);
     

    Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx. - Compatible with sticky parameters. + Compatible with sticky parameters (see below).


    @@ -296,6 +296,19 @@

    Decompression context

      When decompressing many times,
                                   * The higher the value of selected strategy, the more complex it is,
                                   * resulting in stronger and slower compression.
                                   * Special: value 0 means "use default strategy". */
    +
    +    ZSTD_c_targetCBlockSize=130, /* v1.5.6+
    +                                  * Attempts to fit compressed block size into approximately targetCBlockSize.
    +                                  * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX.
    +                                  * Note that it's not a guarantee, just a convergence target (default:0).
    +                                  * No target when targetCBlockSize == 0.
    +                                  * This is helpful in low bandwidth streaming environments to improve end-to-end latency,
    +                                  * when a client can make use of partial documents (a prominent example being Chrome).
    +                                  * Note: this parameter is stable since v1.5.6.
    +                                  * It was present as an experimental parameter in earlier versions,
    +                                  * but it's not recommended using it with earlier library versions
    +                                  * due to massive performance regressions.
    +                                  */
         /* LDM mode parameters */
         ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
                                          * This parameter is designed to improve compression ratio
    @@ -375,7 +388,6 @@ 

    Decompression context

      When decompressing many times,
          * ZSTD_c_forceMaxWindow
          * ZSTD_c_forceAttachDict
          * ZSTD_c_literalCompressionMode
    -     * ZSTD_c_targetCBlockSize
          * ZSTD_c_srcSizeHint
          * ZSTD_c_enableDedicatedDictSearch
          * ZSTD_c_stableInBuffer
    @@ -396,7 +408,7 @@ 

    Decompression context

      When decompressing many times,
          ZSTD_c_experimentalParam3=1000,
          ZSTD_c_experimentalParam4=1001,
          ZSTD_c_experimentalParam5=1002,
    -     ZSTD_c_experimentalParam6=1003,
    +     /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */
          ZSTD_c_experimentalParam7=1004,
          ZSTD_c_experimentalParam8=1005,
          ZSTD_c_experimentalParam9=1006,
    @@ -483,6 +495,7 @@ 

    Decompression context

      When decompressing many times,
                            void* dst, size_t dstCapacity,
                      const void* src, size_t srcSize);
     

    Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + (note that this entry point doesn't even expose a compression level parameter). ZSTD_compress2() always starts a new frame. Should cctx hold data from a previously unfinished frame, everything about it is forgotten. - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() @@ -513,6 +526,7 @@

    Decompression context

      When decompressing many times,
          * ZSTD_d_forceIgnoreChecksum
          * ZSTD_d_refMultipleDDicts
          * ZSTD_d_disableHuffmanAssembly
    +     * ZSTD_d_maxBlockSize
          * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
          * note : never ever use experimentalParam? names directly
          */
    @@ -520,7 +534,8 @@ 

    Decompression context

      When decompressing many times,
          ZSTD_d_experimentalParam2=1001,
          ZSTD_d_experimentalParam3=1002,
          ZSTD_d_experimentalParam4=1003,
    -     ZSTD_d_experimentalParam5=1004
    +     ZSTD_d_experimentalParam5=1004,
    +     ZSTD_d_experimentalParam6=1005
     
     } ZSTD_dParameter;
     

    @@ -568,14 +583,14 @@

    Decompression context

      When decompressing many times,
       A ZSTD_CStream object is required to track streaming operation.
       Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
       ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
    -  It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
    +  It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
     
       For parallel execution, use one separate ZSTD_CStream per thread.
     
       note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
     
       Parameters are sticky : when starting a new compression on the same context,
    -  it will re-use the same sticky parameters as previous compression session.
    +  it will reuse the same sticky parameters as previous compression session.
       When in doubt, it's recommended to fully initialize the context before usage.
       Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
       ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
    @@ -666,6 +681,11 @@ 

    Streaming compression functions

    typedef enum {
                 only ZSTD_e_end or ZSTD_e_flush operations are allowed.
                 Before starting a new compression job, or changing compression parameters,
                 it is required to fully flush internal buffers.
    +  - note: if an operation ends with an error, it may leave @cctx in an undefined state.
    +          Therefore, it's UB to invoke ZSTD_compressStream2() or ZSTD_compressStream() on such a state.
    +          In order to be re-employed after an error, a state must be reset,
    +          which can be done explicitly (ZSTD_CCtx_reset()),
    +          or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx())
      
     


    @@ -698,7 +718,7 @@

    Streaming compression functions

    typedef enum {
     

    Streaming decompression - HowTo

       A ZSTD_DStream object is required to track streaming operations.
       Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
    -  ZSTD_DStream objects can be re-used multiple times.
    +  ZSTD_DStream objects can be reused multiple times.
     
       Use ZSTD_initDStream() to start a new decompression operation.
      @return : recommended first input size
    @@ -751,6 +771,12 @@ 

    Streaming decompression functions


    @return : 0 when a frame is completely decoded and fully flushed, or an error code, which can be tested using ZSTD_isError(), or any other value > 0, which means there is some decoding or flushing to do to complete current frame. + + Note: when an operation returns with an error code, the @zds state may be left in undefined state. + It's UB to invoke `ZSTD_decompressStream()` on such a state. + In order to re-use such a state, it must be first reset, + which can be done explicitly (`ZSTD_DCtx_reset()`), + or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`)


    @@ -869,7 +895,7 @@

    Streaming decompression functions


    Advanced dictionary and prefix API (Requires v1.4.0+)

      This API allows dictionaries to be used with ZSTD_compress2(),
      ZSTD_compressStream2(), and ZSTD_decompressDCtx().
    - Dictionaries are sticky, they remain valid when same context is re-used,
    + Dictionaries are sticky, they remain valid when same context is reused,
      they only reset when the context is reset
      with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
      In contrast, Prefixes are single-use.
    @@ -1386,58 +1412,61 @@ 

    Streaming decompression functions


    Memory management

    
     
    -
    ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
    +
    ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
     

    These functions make it possible to estimate memory usage of a future {D,C}Ctx, before its creation. + This is useful in combination with ZSTD_initStatic(), + which makes it possible to employ a static buffer for ZSTD_CCtx* state. ZSTD_estimateCCtxSize() will provide a memory budget large enough - for any compression level up to selected one. - Note : Unlike ZSTD_estimateCStreamSize*(), this estimate - does not include space for a window buffer. - Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() + associated with any compression level up to max specified one. The estimate will assume the input may be arbitrarily large, which is the worst case. + Note that the size estimation is specific for one-shot compression, + it is not valid for streaming (see ZSTD_estimateCStreamSize*()) + nor other potential ways of using a ZSTD_CCtx* state. + When srcSize can be bound by a known and rather "small" value, - this fact can be used to provide a tighter estimation - because the CCtx compression context will need less memory. - This tighter estimation can be provided by more advanced functions + this knowledge can be used to provide a tighter budget estimation + because the ZSTD_CCtx* state will need less memory for small inputs. + This tighter estimation can be provided by employing more advanced functions ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. Note : only single-threaded compression is supported. ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. 
- - Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time. - Size estimates assume that no external sequence producer is registered.


    -
    ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
    +
    ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
    -ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
    +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
    -

    ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. - It will also consider src size to be arbitrarily "large", which is worst case. +

    ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression + using any compression level up to the max specified one. + It will also consider src size to be arbitrarily "large", which is a worst case scenario. If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. Note : CStream size estimation is only correct for single-threaded compression. - ZSTD_DStream memory budget depends on window Size. + ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. + Size estimates assume that no external sequence producer is registered. + + ZSTD_DStream memory budget depends on frame's window Size. This information can be passed manually, using ZSTD_estimateDStreamSize, or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + Any frame requesting a window size larger than max specified one will be rejected. Note : if streaming is init with function ZSTD_init?Stream_usingDict(), an internal ?Dict will be created, which additional size is not estimated here. In this case, get total size by adding ZSTD_estimate?DictSize - Note 2 : only single-threaded compression is supported. - ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. - Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. 
- Size estimates assume that no external sequence producer is registered.


    @@ -1857,7 +1886,7 @@

    Advanced Streaming compression functions

    0, its value must be correct, as it will be written in header, and controlled at the end. @@ -1918,7 +1947,7 @@

    Advanced Streaming decompression functions

    ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); - re-use decompression parameters from previous init; saves dictionary loading + reuse decompression parameters from previous init; saves dictionary loading


    @@ -1926,7 +1955,7 @@

    Advanced Streaming decompression functions

    ZSTD_registerSequenceProducer( ZSTD_CCtx* cctx, void* sequenceProducerState, - ZSTD_sequenceProducer_F* sequenceProducer + ZSTD_sequenceProducer_F sequenceProducer );

    Instruct zstd to use a block-level external sequence producer function. @@ -1948,6 +1977,22 @@

    Advanced Streaming decompression functions

    calling this function.


    +
    ZSTDLIB_STATIC_API void
    +ZSTD_CCtxParams_registerSequenceProducer(
    +  ZSTD_CCtx_params* params,
    +  void* sequenceProducerState,
    +  ZSTD_sequenceProducer_F sequenceProducer
    +);
    +

    Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. + This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), + which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). + + If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() + is required, then this function is for you. Otherwise, you probably don't need it. + + See tests/zstreamtest.c for example usage. +


    +

    Buffer-less and synchronous inner streaming functions (DEPRECATED)

       This API is deprecated, and will be removed in a future version.
       It allows streaming (de)compression with user allocated buffers.
    @@ -1964,7 +2009,7 @@ 

    Advanced Streaming decompression functions

    Buffer-less streaming compression (synchronous mode)

       A ZSTD_CCtx object is required to track streaming operations.
       Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
    -  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
    +  ZSTD_CCtx object can be reused multiple times within successive compression operations.
     
       Start by initializing a context.
       Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
    @@ -1985,7 +2030,7 @@ 

    Advanced Streaming decompression functions

    It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. - `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. + `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again.

    Buffer-less streaming compression functions

    ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
    @@ -2002,7 +2047,7 @@ 

    Buffer-less streaming compression functions

    ZSTD_DEPR
     

    Buffer-less streaming decompression (synchronous mode)

       A ZSTD_DCtx object is required to track streaming operations.
       Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
    -  A ZSTD_DCtx object can be re-used multiple times.
    +  A ZSTD_DCtx object can be reused multiple times.
     
       First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
       Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
    diff --git a/third-party/zstd/examples/streaming_compression.c b/third-party/zstd/examples/streaming_compression.c
    index ed0a3a69..063aa82a 100644
    --- a/third-party/zstd/examples/streaming_compression.c
    +++ b/third-party/zstd/examples/streaming_compression.c
    @@ -42,7 +42,13 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
          */
         CHECK_ZSTD( ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, cLevel) );
         CHECK_ZSTD( ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1) );
    -    ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, nbThreads);
    +    if (nbThreads > 1) {
    +        size_t const r = ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, nbThreads);
    +        if (ZSTD_isError(r)) {
    +            fprintf (stderr, "Note: the linked libzstd library doesn't support multithreading. "
    +                             "Reverting to single-thread mode. \n");
    +        }
    +    }
     
         /* This loop read from the input file, compresses that entire chunk,
          * and writes all output produced to the output file.
    @@ -117,7 +123,7 @@ int main(int argc, const char** argv)
         }
     
         int cLevel = 1;
    -    int nbThreads = 4;
    +    int nbThreads = 1;
     
         if (argc >= 3) {
           cLevel = atoi (argv[2]);
    diff --git a/third-party/zstd/lib/Makefile b/third-party/zstd/lib/Makefile
    index a4cf61ab..8bfdade9 100644
    --- a/third-party/zstd/lib/Makefile
    +++ b/third-party/zstd/lib/Makefile
    @@ -8,6 +8,9 @@
     # You may select, at your option, one of the above-listed licenses.
     # ################################################################
     
    +# default target (when running `make` with no argument)
    +lib-release:
    +
     # Modules
     ZSTD_LIB_COMPRESSION ?= 1
     ZSTD_LIB_DECOMPRESSION ?= 1
    @@ -54,12 +57,11 @@ VERSION := $(ZSTD_VERSION)
     # Note: by default, the static library is built single-threaded and dynamic library is built
     # multi-threaded. It is possible to force multi or single threaded builds by appending
     # -mt or -nomt to the build target (like lib-mt for multi-threaded, lib-nomt for single-threaded).
    -.PHONY: default
    -default: lib-release
    +
     
     CPPFLAGS_DYNLIB  += -DZSTD_MULTITHREAD # dynamic library build defaults to multi-threaded
     LDFLAGS_DYNLIB   += -pthread
    -CPPFLAGS_STATLIB +=                    # static library build defaults to single-threaded
    +CPPFLAGS_STATICLIB +=                  # static library build defaults to single-threaded
     
     
     ifeq ($(findstring GCC,$(CCVER)),GCC)
    @@ -91,7 +93,7 @@ all: lib
     
     
     .PHONY: libzstd.a  # must be run every time
    -libzstd.a: CPPFLAGS += $(CPPFLAGS_STATLIB)
    +libzstd.a: CPPFLAGS += $(CPPFLAGS_STATICLIB)
     
     SET_CACHE_DIRECTORY = \
        +$(MAKE) --no-print-directory $@ \
    @@ -109,19 +111,19 @@ libzstd.a:
     else
     # BUILD_DIR is defined
     
    -ZSTD_STATLIB_DIR := $(BUILD_DIR)/static
    -ZSTD_STATLIB := $(ZSTD_STATLIB_DIR)/libzstd.a
    -ZSTD_STATLIB_OBJ := $(addprefix $(ZSTD_STATLIB_DIR)/,$(ZSTD_LOCAL_OBJ))
    -$(ZSTD_STATLIB): ARFLAGS = rcs
    -$(ZSTD_STATLIB): | $(ZSTD_STATLIB_DIR)
    -$(ZSTD_STATLIB): $(ZSTD_STATLIB_OBJ)
    +ZSTD_STATICLIB_DIR := $(BUILD_DIR)/static
    +ZSTD_STATICLIB := $(ZSTD_STATICLIB_DIR)/libzstd.a
    +ZSTD_STATICLIB_OBJ := $(addprefix $(ZSTD_STATICLIB_DIR)/,$(ZSTD_LOCAL_OBJ))
    +$(ZSTD_STATICLIB): ARFLAGS = rcs
    +$(ZSTD_STATICLIB): | $(ZSTD_STATICLIB_DIR)
    +$(ZSTD_STATICLIB): $(ZSTD_STATICLIB_OBJ)
       # Check for multithread flag at target execution time
     	$(if $(filter -DZSTD_MULTITHREAD,$(CPPFLAGS)),\
         @echo compiling multi-threaded static library $(LIBVER),\
         @echo compiling single-threaded static library $(LIBVER))
     	$(AR) $(ARFLAGS) $@ $^
     
    -libzstd.a: $(ZSTD_STATLIB)
    +libzstd.a: $(ZSTD_STATICLIB)
     	cp -f $< $@
     
     endif
    @@ -182,14 +184,14 @@ lib : libzstd.a libzstd
     # make does not consider implicit pattern rule for .PHONY target
     
     %-mt : CPPFLAGS_DYNLIB  := -DZSTD_MULTITHREAD
    -%-mt : CPPFLAGS_STATLIB := -DZSTD_MULTITHREAD
    +%-mt : CPPFLAGS_STATICLIB := -DZSTD_MULTITHREAD
     %-mt : LDFLAGS_DYNLIB   := -pthread
     %-mt : %
     	@echo multi-threaded build completed
     
     %-nomt : CPPFLAGS_DYNLIB  :=
     %-nomt : LDFLAGS_DYNLIB   :=
    -%-nomt : CPPFLAGS_STATLIB :=
    +%-nomt : CPPFLAGS_STATICLIB :=
     %-nomt : %
     	@echo single-threaded build completed
     
    @@ -200,42 +202,52 @@ lib : libzstd.a libzstd
     
     # Generate .h dependencies automatically
     
    -DEPFLAGS = -MT $@ -MMD -MP -MF
    +# -MMD: compiler generates dependency information as a side-effect of compilation, without system headers
    +# -MP: adds phony target for each dependency other than main file.
    +DEPFLAGS = -MMD -MP
     
    -$(ZSTD_DYNLIB_DIR)/%.o : %.c $(ZSTD_DYNLIB_DIR)/%.d | $(ZSTD_DYNLIB_DIR)
    +# ensure that ZSTD_DYNLIB_DIR exists prior to generating %.o
    +$(ZSTD_DYNLIB_DIR)/%.o : %.c | $(ZSTD_DYNLIB_DIR)
     	@echo CC $@
    -	$(COMPILE.c) $(DEPFLAGS) $(ZSTD_DYNLIB_DIR)/$*.d $(OUTPUT_OPTION) $<
    +	$(COMPILE.c) $(DEPFLAGS) $(OUTPUT_OPTION) $<
     
    -$(ZSTD_STATLIB_DIR)/%.o : %.c $(ZSTD_STATLIB_DIR)/%.d | $(ZSTD_STATLIB_DIR)
    +$(ZSTD_STATICLIB_DIR)/%.o : %.c | $(ZSTD_STATICLIB_DIR)
     	@echo CC $@
    -	$(COMPILE.c) $(DEPFLAGS) $(ZSTD_STATLIB_DIR)/$*.d $(OUTPUT_OPTION) $<
    +	$(COMPILE.c) $(DEPFLAGS) $(OUTPUT_OPTION) $<
     
     $(ZSTD_DYNLIB_DIR)/%.o : %.S | $(ZSTD_DYNLIB_DIR)
     	@echo AS $@
     	$(COMPILE.S) $(OUTPUT_OPTION) $<
     
    -$(ZSTD_STATLIB_DIR)/%.o : %.S | $(ZSTD_STATLIB_DIR)
    +$(ZSTD_STATICLIB_DIR)/%.o : %.S | $(ZSTD_STATICLIB_DIR)
     	@echo AS $@
     	$(COMPILE.S) $(OUTPUT_OPTION) $<
     
    -MKDIR ?= mkdir
    -$(BUILD_DIR) $(ZSTD_DYNLIB_DIR) $(ZSTD_STATLIB_DIR):
    -	$(MKDIR) -p $@
    +MKDIR ?= mkdir -p
    +$(BUILD_DIR) $(ZSTD_DYNLIB_DIR) $(ZSTD_STATICLIB_DIR):
    +	$(MKDIR) $@
     
    -DEPFILES := $(ZSTD_DYNLIB_OBJ:.o=.d) $(ZSTD_STATLIB_OBJ:.o=.d)
    +DEPFILES := $(ZSTD_DYNLIB_OBJ:.o=.d) $(ZSTD_STATICLIB_OBJ:.o=.d)
     $(DEPFILES):
     
    -include $(wildcard $(DEPFILES))
    +# The leading '-' means: do not fail if include fails (ex: directory does not exist yet)
    +-include $(wildcard $(DEPFILES))
     
     
    -# Special case : building library in single-thread mode _and_ without zstdmt_compress.c
    -ZSTDMT_FILES = compress/zstdmt_compress.c
    -ZSTD_NOMT_FILES = $(filter-out $(ZSTDMT_FILES),$(ZSTD_FILES))
    +# Special case : build library in single-thread mode _and_ without zstdmt_compress.c
    +# Note : we still need threading.c and pool.c for the dictionary builder,
    +# but they will correctly behave single-threaded.
    +ZSTDMT_FILES = zstdmt_compress.c
    +ZSTD_NOMT_FILES = $(filter-out $(ZSTDMT_FILES),$(notdir $(ZSTD_FILES)))
     libzstd-nomt: CFLAGS += -fPIC -fvisibility=hidden
     libzstd-nomt: LDFLAGS += -shared
     libzstd-nomt: $(ZSTD_NOMT_FILES)
     	@echo compiling single-thread dynamic library $(LIBVER)
     	@echo files : $(ZSTD_NOMT_FILES)
    +	@if echo "$(ZSTD_NOMT_FILES)" | tr ' ' '\n' | $(GREP) -q zstdmt; then \
    +        echo "Error: Found zstdmt in list."; \
    +        exit 1; \
    +    fi
     	$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
     
     .PHONY: clean
    @@ -249,7 +261,7 @@ clean:
     #-----------------------------------------------------------------------------
     # make install is validated only for below listed environments
     #-----------------------------------------------------------------------------
    -ifneq (,$(filter $(UNAME),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku AIX))
    +ifneq (,$(filter $(UNAME),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku AIX MSYS_NT CYGWIN_NT))
     
     lib: libzstd.pc
     
    diff --git a/third-party/zstd/lib/README.md b/third-party/zstd/lib/README.md
    index c3b5d181..a560f06c 100644
    --- a/third-party/zstd/lib/README.md
    +++ b/third-party/zstd/lib/README.md
    @@ -88,7 +88,7 @@ The file structure is designed to make this selection manually achievable for an
             For example, advanced API for version `v0.4` is exposed in `lib/legacy/zstd_v04.h` .
     
     - While invoking `make libzstd`, it's possible to define build macros
    -        `ZSTD_LIB_COMPRESSION, ZSTD_LIB_DECOMPRESSION`, `ZSTD_LIB_DICTBUILDER`,
    +        `ZSTD_LIB_COMPRESSION`, `ZSTD_LIB_DECOMPRESSION`, `ZSTD_LIB_DICTBUILDER`,
             and `ZSTD_LIB_DEPRECATED` as `0` to forgo compilation of the
             corresponding features. This will also disable compilation of all
             dependencies (e.g. `ZSTD_LIB_COMPRESSION=0` will also disable
    @@ -119,6 +119,15 @@ The file structure is designed to make this selection manually achievable for an
       binary is achieved by using `HUF_FORCE_DECOMPRESS_X1` and
       `ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT` (implied by `ZSTD_LIB_MINIFY`).
     
    +  On the compressor side, Zstd's compression levels map to several internal
    +  strategies. In environments where the higher compression levels aren't used,
    +  it is possible to exclude all but the fastest strategy with
    +  `ZSTD_LIB_EXCLUDE_COMPRESSORS_DFAST_AND_UP=1`. (Note that this will change
    +  the behavior of the default compression level.) Or if you want to retain the
    +  default compressor as well, you can set
    +  `ZSTD_LIB_EXCLUDE_COMPRESSORS_GREEDY_AND_UP=1`, at the cost of an additional
    +  ~20KB or so.
    +
       For squeezing the last ounce of size out, you can also define
       `ZSTD_NO_INLINE`, which disables inlining, and `ZSTD_STRIP_ERROR_STRINGS`,
       which removes the error messages that are otherwise returned by
    @@ -169,6 +178,10 @@ The file structure is designed to make this selection manually achievable for an
       `ZSTDERRORLIB_VSIBILITY`, and `ZDICTLIB_VISIBILITY` if unset, for backwards compatibility
       with the old macro names.
     
    +- The C compiler macro `HUF_DISABLE_FAST_DECODE` disables the newer Huffman fast C
    +  and assembly decoding loops. You may want to use this macro if these loops are
    +  slower on your platform.
    +
     #### Windows : using MinGW+MSYS to create DLL
     
     DLL can be created using MinGW+MSYS with the `make libzstd` command.
    diff --git a/third-party/zstd/lib/common/allocations.h b/third-party/zstd/lib/common/allocations.h
    index a3153c4b..5e899550 100644
    --- a/third-party/zstd/lib/common/allocations.h
    +++ b/third-party/zstd/lib/common/allocations.h
    @@ -14,7 +14,7 @@
     #define ZSTD_DEPS_NEED_MALLOC
     #include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
     
    -#include "mem.h" /* MEM_STATIC */
    +#include "compiler.h" /* MEM_STATIC */
     #define ZSTD_STATIC_LINKING_ONLY
     #include "../zstd.h" /* ZSTD_customMem */
     
    diff --git a/third-party/zstd/lib/common/bitstream.h b/third-party/zstd/lib/common/bitstream.h
    index 72b0b3df..67604498 100644
    --- a/third-party/zstd/lib/common/bitstream.h
    +++ b/third-party/zstd/lib/common/bitstream.h
    @@ -90,19 +90,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
     /*-********************************************
     *  bitStream decoding API (read backward)
     **********************************************/
    +typedef size_t BitContainerType;
     typedef struct {
    -    size_t   bitContainer;
    +    BitContainerType bitContainer;
         unsigned bitsConsumed;
         const char* ptr;
         const char* start;
         const char* limitPtr;
     } BIT_DStream_t;
     
    -typedef enum { BIT_DStream_unfinished = 0,
    -               BIT_DStream_endOfBuffer = 1,
    -               BIT_DStream_completed = 2,
    -               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
    -               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
    +typedef enum { BIT_DStream_unfinished = 0,  /* fully refilled */
    +               BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */
    +               BIT_DStream_completed = 2,   /* bitstream entirely consumed, bit-exact */
    +               BIT_DStream_overflow = 3     /* user requested more bits than present in bitstream */
    +    } BIT_DStream_status;  /* result of BIT_reloadDStream() */
     
     MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
     MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
    @@ -112,7 +113,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
     
     /* Start by invoking BIT_initDStream().
     *  A chunk of the bitStream is then stored into a local register.
    -*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
    +*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType).
     *  You can then retrieve bitFields stored into the local register, **in reverse order**.
     *  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
     *  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
    @@ -162,7 +163,7 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
         return 0;
     }
     
    -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
    +FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
     {
     #if defined(STATIC_BMI2) && STATIC_BMI2 == 1 && !defined(ZSTD_NO_INTRINSICS)
         return  _bzhi_u64(bitContainer, nbBits);
    @@ -267,22 +268,22 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
             bitD->bitContainer = *(const BYTE*)(bitD->start);
             switch(srcSize)
             {
    -        case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
    +        case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
                     ZSTD_FALLTHROUGH;
     
    -        case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
    +        case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
                     ZSTD_FALLTHROUGH;
     
    -        case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
    +        case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
                     ZSTD_FALLTHROUGH;
     
    -        case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
    +        case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24;
                     ZSTD_FALLTHROUGH;
     
    -        case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
    +        case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16;
                     ZSTD_FALLTHROUGH;
     
    -        case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8;
    +        case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) <<  8;
                     ZSTD_FALLTHROUGH;
     
             default: break;
    @@ -297,12 +298,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
         return srcSize;
     }
     
    -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
    +FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start)
     {
         return bitContainer >> start;
     }
     
    -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
    +FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits)
     {
         U32 const regMask = sizeof(bitContainer)*8 - 1;
         /* if start > regMask, bitstream is corrupted, and result is undefined */
    @@ -325,7 +326,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
      *  On 32-bits, maxNbBits==24.
      *  On 64-bits, maxNbBits==56.
      * @return : value extracted */
    -MEM_STATIC  FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
    +FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
     {
         /* arbitrate between double-shift and shift+mask */
     #if 1
    @@ -348,7 +349,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
         return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
     }
     
    -MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
    +FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
     {
         bitD->bitsConsumed += nbBits;
     }
    @@ -357,7 +358,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
      *  Read (consume) next n bits from local register and update.
      *  Pay attention to not read more than nbBits contained into local register.
      * @return : extracted value. */
    -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
    +FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
     {
         size_t const value = BIT_lookBits(bitD, nbBits);
         BIT_skipBits(bitD, nbBits);
    @@ -374,6 +375,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
         return value;
     }
     
    +/*! BIT_reloadDStream_internal() :
    + *  Simple variant of BIT_reloadDStream(), with two conditions:
    + *  1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8
    + *  2. look window is valid after shifted down : bitD->ptr >= bitD->start
    + */
    +MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD)
    +{
    +    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
    +    bitD->ptr -= bitD->bitsConsumed >> 3;
    +    assert(bitD->ptr >= bitD->start);
    +    bitD->bitsConsumed &= 7;
    +    bitD->bitContainer = MEM_readLEST(bitD->ptr);
    +    return BIT_DStream_unfinished;
    +}
    +
     /*! BIT_reloadDStreamFast() :
      *  Similar to BIT_reloadDStream(), but with two differences:
      *  1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
    @@ -384,31 +400,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
     {
         if (UNLIKELY(bitD->ptr < bitD->limitPtr))
             return BIT_DStream_overflow;
    -    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
    -    bitD->ptr -= bitD->bitsConsumed >> 3;
    -    bitD->bitsConsumed &= 7;
    -    bitD->bitContainer = MEM_readLEST(bitD->ptr);
    -    return BIT_DStream_unfinished;
    +    return BIT_reloadDStream_internal(bitD);
     }
     
     /*! BIT_reloadDStream() :
      *  Refill `bitD` from buffer previously set in BIT_initDStream() .
    - *  This function is safe, it guarantees it will not read beyond src buffer.
     + *  This function is safe, it guarantees it will never read beyond src buffer.
      * @return : status of `BIT_DStream_t` internal register.
      *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
    -MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
    +FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
     {
    -    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
    +    /* note : once in overflow mode, a bitstream remains in this mode until it's reset */
    +    if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) {
    +        static const BitContainerType zeroFilled = 0;
    +        bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */
    +        /* overflow detected, erroneous scenario or end of stream: no update */
             return BIT_DStream_overflow;
    +    }
    +
    +    assert(bitD->ptr >= bitD->start);
     
         if (bitD->ptr >= bitD->limitPtr) {
    -        return BIT_reloadDStreamFast(bitD);
    +        return BIT_reloadDStream_internal(bitD);
         }
         if (bitD->ptr == bitD->start) {
    +        /* reached end of bitStream => no update */
             if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
             return BIT_DStream_completed;
         }
    -    /* start < ptr < limitPtr */
    +    /* start < ptr < limitPtr => cautious update */
         {   U32 nbBytes = bitD->bitsConsumed >> 3;
             BIT_DStream_status result = BIT_DStream_unfinished;
             if (bitD->ptr - nbBytes < bitD->start) {
    diff --git a/third-party/zstd/lib/common/compiler.h b/third-party/zstd/lib/common/compiler.h
    index 73f8d019..31880ecb 100644
    --- a/third-party/zstd/lib/common/compiler.h
    +++ b/third-party/zstd/lib/common/compiler.h
    @@ -11,6 +11,8 @@
     #ifndef ZSTD_COMPILER_H
     #define ZSTD_COMPILER_H
     
     +#include <stddef.h>
    +
     #include "portability_macros.h"
     
     /*-*******************************************************
    @@ -51,12 +53,19 @@
     #  define WIN_CDECL
     #endif
     
    +/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
    +#if defined(__GNUC__)
    +#  define UNUSED_ATTR __attribute__((unused))
    +#else
    +#  define UNUSED_ATTR
    +#endif
    +
     /**
      * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
      * parameters. They must be inlined for the compiler to eliminate the constant
      * branches.
      */
    -#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
    +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR
     /**
      * HINT_INLINE is used to help the compiler generate better code. It is *not*
      * used for "templates", so it can be tweaked based on the compilers
    @@ -71,14 +80,28 @@
     #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
     #  define HINT_INLINE static INLINE_KEYWORD
     #else
    -#  define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
    +#  define HINT_INLINE FORCE_INLINE_TEMPLATE
     #endif
     
    -/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
    +/* "soft" inline :
    + * The compiler is free to select if it's a good idea to inline or not.
    + * The main objective is to silence compiler warnings
     + * when a defined function is included but not used.
    + *
    + * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit.
    + * Updating the prefix is probably preferable, but requires a fairly large codemod,
    + * since this name is used everywhere.
    + */
    +#ifndef MEM_STATIC  /* already defined in Linux Kernel mem.h */
     #if defined(__GNUC__)
    -#  define UNUSED_ATTR __attribute__((unused))
    +#  define MEM_STATIC static __inline UNUSED_ATTR
    +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
    +#  define MEM_STATIC static inline
    +#elif defined(_MSC_VER)
    +#  define MEM_STATIC static __inline
     #else
    -#  define UNUSED_ATTR
    +#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
    +#endif
     #endif
     
     /* force no inlining */
    @@ -109,10 +132,10 @@
     /* prefetch
      * can be disabled, by declaring NO_PREFETCH build macro */
     #if defined(NO_PREFETCH)
    -#  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
    -#  define PREFETCH_L2(ptr)  (void)(ptr)  /* disabled */
    +#  define PREFETCH_L1(ptr)  do { (void)(ptr); } while (0)  /* disabled */
    +#  define PREFETCH_L2(ptr)  do { (void)(ptr); } while (0)  /* disabled */
     #else
    -#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
    +#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC)  /* _mm_prefetch() is not defined outside of x86/x64 */
      #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
     #    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
     #    define PREFETCH_L2(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
    @@ -120,24 +143,25 @@
     #    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
     #    define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
     #  elif defined(__aarch64__)
    -#    define PREFETCH_L1(ptr)  __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
    -#    define PREFETCH_L2(ptr)  __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
    +#    define PREFETCH_L1(ptr)  do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
    +#    define PREFETCH_L2(ptr)  do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
     #  else
    -#    define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
    -#    define PREFETCH_L2(ptr) (void)(ptr)  /* disabled */
    +#    define PREFETCH_L1(ptr) do { (void)(ptr); } while (0)  /* disabled */
    +#    define PREFETCH_L2(ptr) do { (void)(ptr); } while (0)  /* disabled */
     #  endif
     #endif  /* NO_PREFETCH */
     
     #define CACHELINE_SIZE 64
     
    -#define PREFETCH_AREA(p, s)  {            \
    -    const char* const _ptr = (const char*)(p);  \
    -    size_t const _size = (size_t)(s);     \
    -    size_t _pos;                          \
    -    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
    -        PREFETCH_L2(_ptr + _pos);         \
    -    }                                     \
    -}
    +#define PREFETCH_AREA(p, s)                              \
    +    do {                                                 \
    +        const char* const _ptr = (const char*)(p);       \
    +        size_t const _size = (size_t)(s);                \
    +        size_t _pos;                                     \
    +        for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
    +            PREFETCH_L2(_ptr + _pos);                    \
    +        }                                                \
    +    } while (0)
     
     /* vectorization
      * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax,
    @@ -166,9 +190,9 @@
     #endif
     
     #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
    -#  define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); }
    +#  define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0)
     #else
    -#  define ZSTD_UNREACHABLE { assert(0); }
    +#  define ZSTD_UNREACHABLE do { assert(0); } while (0)
     #endif
     
     /* disable warnings */
    @@ -281,6 +305,74 @@
     *  Sanitizer
     *****************************************************************/
     
    +/**
    + * Zstd relies on pointer overflow in its decompressor.
    + * We add this attribute to functions that rely on pointer overflow.
    + */
    +#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +#  if __has_attribute(no_sanitize)
    +#    if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8
     +       /* gcc < 8 only has signed-integer-overflow which triggers on pointer overflow */
    +#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow")))
    +#    else
    +       /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */
    +#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow")))
    +#    endif
    +#  else
    +#    define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +#  endif
    +#endif
    +
    +/**
     + * Helper function to perform a wrapped pointer difference without triggering
    + * UBSAN.
    + *
    + * @returns lhs - rhs with wrapping
    + */
    +MEM_STATIC
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs)
    +{
    +    return lhs - rhs;
    +}
    +
    +/**
    + * Helper function to perform a wrapped pointer add without triggering UBSAN.
    + *
    + * @return ptr + add with wrapping
    + */
    +MEM_STATIC
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add)
    +{
    +    return ptr + add;
    +}
    +
    +/**
    + * Helper function to perform a wrapped pointer subtraction without triggering
    + * UBSAN.
    + *
    + * @return ptr - sub with wrapping
    + */
    +MEM_STATIC
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub)
    +{
    +    return ptr - sub;
    +}
    +
    +/**
    + * Helper function to add to a pointer that works around C's undefined behavior
    + * of adding 0 to NULL.
    + *
    + * @returns `ptr + add` except it defines `NULL + 0 == NULL`.
    + */
    +MEM_STATIC
    +unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add)
    +{
    +    return add > 0 ? ptr + add : ptr;
    +}
    +
     /* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
      * abundance of caution, disable our custom poisoning on mingw. */
     #ifdef __MINGW32__
    diff --git a/third-party/zstd/lib/common/cpu.h b/third-party/zstd/lib/common/cpu.h
    index 8bc34a36..0e684d9a 100644
    --- a/third-party/zstd/lib/common/cpu.h
    +++ b/third-party/zstd/lib/common/cpu.h
    @@ -35,6 +35,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
         U32 f7b = 0;
         U32 f7c = 0;
     #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
    +#if !defined(__clang__)
         int reg[4];
         __cpuid((int*)reg, 0);
         {
    @@ -50,6 +51,41 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
                 f7c = (U32)reg[2];
             }
         }
    +#else
    +    /* Clang compiler has a bug (fixed in https://reviews.llvm.org/D101338) in
    +     * which the `__cpuid` intrinsic does not save and restore `rbx` as it needs
    +     * to due to being a reserved register. So in that case, do the `cpuid`
    +     * ourselves. Clang supports inline assembly anyway.
    +     */
    +    U32 n;
    +    __asm__(
    +        "pushq %%rbx\n\t"
    +        "cpuid\n\t"
    +        "popq %%rbx\n\t"
    +        : "=a"(n)
    +        : "a"(0)
    +        : "rcx", "rdx");
    +    if (n >= 1) {
    +      U32 f1a;
    +      __asm__(
    +          "pushq %%rbx\n\t"
    +          "cpuid\n\t"
    +          "popq %%rbx\n\t"
    +          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
    +          : "a"(1)
    +          :);
    +    }
    +    if (n >= 7) {
    +      __asm__(
    +          "pushq %%rbx\n\t"
    +          "cpuid\n\t"
    +          "movq %%rbx, %%rax\n\t"
    +          "popq %%rbx"
    +          : "=a"(f7b), "=c"(f7c)
    +          : "a"(7), "c"(0)
    +          : "rdx");
    +    }
    +#endif
     #elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
         /* The following block like the normal cpuid branch below, but gcc
          * reserves ebx for use of its pic register so we must specially
    diff --git a/third-party/zstd/lib/common/debug.c b/third-party/zstd/lib/common/debug.c
    index ebf7bfcc..9d0b7d22 100644
    --- a/third-party/zstd/lib/common/debug.c
    +++ b/third-party/zstd/lib/common/debug.c
    @@ -21,4 +21,10 @@
     
     #include "debug.h"
     
    +#if !defined(ZSTD_LINUX_KERNEL) || (DEBUGLEVEL>=2)
    +/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a
    + * translation unit is empty. So remove this from Linux kernel builds, but
    + * otherwise just leave it in.
    + */
     int g_debuglevel = DEBUGLEVEL;
    +#endif
    diff --git a/third-party/zstd/lib/common/debug.h b/third-party/zstd/lib/common/debug.h
    index 0e9817ea..a16b69e5 100644
    --- a/third-party/zstd/lib/common/debug.h
    +++ b/third-party/zstd/lib/common/debug.h
    @@ -85,18 +85,27 @@ extern int g_debuglevel; /* the variable is only declared,
                                 It's useful when enabling very verbose levels
                                 on selective conditions (such as position in src) */
     
    -#  define RAWLOG(l, ...) {                                       \
    -                if (l<=g_debuglevel) {                           \
    -                    ZSTD_DEBUG_PRINT(__VA_ARGS__);               \
    -            }   }
    -#  define DEBUGLOG(l, ...) {                                     \
    -                if (l<=g_debuglevel) {                           \
    -                    ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \
    -                    ZSTD_DEBUG_PRINT(" \n");                     \
    -            }   }
    +#  define RAWLOG(l, ...)                   \
    +    do {                                   \
    +        if (l<=g_debuglevel) {             \
    +            ZSTD_DEBUG_PRINT(__VA_ARGS__); \
    +        }                                  \
    +    } while (0)
    +
    +#define STRINGIFY(x) #x
    +#define TOSTRING(x) STRINGIFY(x)
    +#define LINE_AS_STRING TOSTRING(__LINE__)
    +
    +#  define DEBUGLOG(l, ...)                               \
    +    do {                                                 \
    +        if (l<=g_debuglevel) {                           \
    +            ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \
    +            ZSTD_DEBUG_PRINT(" \n");                     \
    +        }                                                \
    +    } while (0)
     #else
    -#  define RAWLOG(l, ...)      {}    /* disabled */
    -#  define DEBUGLOG(l, ...)    {}    /* disabled */
    +#  define RAWLOG(l, ...)   do { } while (0)    /* disabled */
    +#  define DEBUGLOG(l, ...) do { } while (0)    /* disabled */
     #endif
     
     
    diff --git a/third-party/zstd/lib/common/error_private.h b/third-party/zstd/lib/common/error_private.h
    index 325daad4..0156010c 100644
    --- a/third-party/zstd/lib/common/error_private.h
    +++ b/third-party/zstd/lib/common/error_private.h
    @@ -60,8 +60,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
     ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
     
     /* check and forward error code */
    -#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
    -#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
    +#define CHECK_V_F(e, f)     \
    +    size_t const e = f;     \
    +    do {                    \
    +        if (ERR_isError(e)) \
    +            return e;       \
    +    } while (0)
    +#define CHECK_F(f)   do { CHECK_V_F(_var_err__, f); } while (0)
     
     
     /*-****************************************
    @@ -95,10 +100,12 @@ void _force_has_format_string(const char *format, ...) {
      * We want to force this function invocation to be syntactically correct, but
      * we don't want to force runtime evaluation of its arguments.
      */
    -#define _FORCE_HAS_FORMAT_STRING(...) \
    -  if (0) { \
    -    _force_has_format_string(__VA_ARGS__); \
    -  }
    +#define _FORCE_HAS_FORMAT_STRING(...)              \
    +    do {                                           \
    +        if (0) {                                   \
    +            _force_has_format_string(__VA_ARGS__); \
    +        }                                          \
    +    } while (0)
     
     #define ERR_QUOTE(str) #str
     
    @@ -109,48 +116,50 @@ void _force_has_format_string(const char *format, ...) {
      * In order to do that (particularly, printing the conditional that failed),
      * this can't just wrap RETURN_ERROR().
      */
    -#define RETURN_ERROR_IF(cond, err, ...) \
    -  if (cond) { \
    -    RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
    -           __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
    -    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
    -    RAWLOG(3, ": " __VA_ARGS__); \
    -    RAWLOG(3, "\n"); \
    -    return ERROR(err); \
    -  }
    +#define RETURN_ERROR_IF(cond, err, ...)                                        \
    +    do {                                                                       \
    +        if (cond) {                                                            \
    +            RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s",          \
    +                  __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
    +            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                             \
    +            RAWLOG(3, ": " __VA_ARGS__);                                       \
    +            RAWLOG(3, "\n");                                                   \
    +            return ERROR(err);                                                 \
    +        }                                                                      \
    +    } while (0)
     
     /**
      * Unconditionally return the specified error.
      *
      * In debug modes, prints additional information.
      */
    -#define RETURN_ERROR(err, ...) \
    -  do { \
    -    RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
    -           __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \
    -    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
    -    RAWLOG(3, ": " __VA_ARGS__); \
    -    RAWLOG(3, "\n"); \
    -    return ERROR(err); \
    -  } while(0);
    +#define RETURN_ERROR(err, ...)                                               \
    +    do {                                                                     \
    +        RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
    +              __FILE__, __LINE__, ERR_QUOTE(ERROR(err)));                    \
    +        _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                               \
    +        RAWLOG(3, ": " __VA_ARGS__);                                         \
    +        RAWLOG(3, "\n");                                                     \
    +        return ERROR(err);                                                   \
    +    } while(0)
     
     /**
      * If the provided expression evaluates to an error code, returns that error code.
      *
      * In debug modes, prints additional information.
      */
    -#define FORWARD_IF_ERROR(err, ...) \
    -  do { \
    -    size_t const err_code = (err); \
    -    if (ERR_isError(err_code)) { \
    -      RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
    -             __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
    -      _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
    -      RAWLOG(3, ": " __VA_ARGS__); \
    -      RAWLOG(3, "\n"); \
    -      return err_code; \
    -    } \
    -  } while(0);
    +#define FORWARD_IF_ERROR(err, ...)                                                 \
    +    do {                                                                           \
    +        size_t const err_code = (err);                                             \
    +        if (ERR_isError(err_code)) {                                               \
    +            RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s",                 \
    +                  __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
    +            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                                 \
    +            RAWLOG(3, ": " __VA_ARGS__);                                           \
    +            RAWLOG(3, "\n");                                                       \
    +            return err_code;                                                       \
    +        }                                                                          \
    +    } while(0)
     
     #if defined (__cplusplus)
     }
    diff --git a/third-party/zstd/lib/common/fse.h b/third-party/zstd/lib/common/fse.h
    index 02a1f0bc..2ae128e6 100644
    --- a/third-party/zstd/lib/common/fse.h
    +++ b/third-party/zstd/lib/common/fse.h
    @@ -229,6 +229,7 @@ If there is an error, the function will return an error code, which can be teste
     
     #endif  /* FSE_H */
     
    +
     #if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
     #define FSE_H_FSE_STATIC_LINKING_ONLY
     
    @@ -464,13 +465,13 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un
         FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
         const U16* const stateTable = (const U16*)(statePtr->stateTable);
         U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
    -    BIT_addBits(bitC, statePtr->value, nbBitsOut);
    +    BIT_addBits(bitC,  (size_t)statePtr->value, nbBitsOut);
         statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
     }
     
     MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
     {
    -    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
    +    BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog);
         BIT_flushBits(bitC);
     }
     
    diff --git a/third-party/zstd/lib/common/fse_decompress.c b/third-party/zstd/lib/common/fse_decompress.c
    index 1e1c9f92..0dcc4640 100644
    --- a/third-party/zstd/lib/common/fse_decompress.c
    +++ b/third-party/zstd/lib/common/fse_decompress.c
    @@ -22,8 +22,7 @@
     #define FSE_STATIC_LINKING_ONLY
     #include "fse.h"
     #include "error_private.h"
    -#define ZSTD_DEPS_NEED_MALLOC
    -#include "zstd_deps.h"
    +#include "zstd_deps.h"  /* ZSTD_memcpy */
     #include "bits.h"       /* ZSTD_highbit32 */
     
     
    @@ -84,7 +83,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
                         symbolNext[s] = 1;
                     } else {
                         if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
    -                    symbolNext[s] = normalizedCounter[s];
    +                    symbolNext[s] = (U16)normalizedCounter[s];
             }   }   }
             ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
         }
    @@ -99,8 +98,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
              * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
              * our buffer to handle the over-write.
              */
    -        {
    -            U64 const add = 0x0101010101010101ull;
    +        {   U64 const add = 0x0101010101010101ull;
                 size_t pos = 0;
                 U64 sv = 0;
                 U32 s;
    @@ -111,9 +109,8 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
                     for (i = 8; i < n; i += 8) {
                         MEM_write64(spread + pos + i, sv);
                     }
    -                pos += n;
    -            }
    -        }
    +                pos += (size_t)n;
    +        }   }
             /* Now we spread those positions across the table.
              * The benefit of doing it in two stages is that we avoid the
              * variable size inner loop, which caused lots of branch misses.
    @@ -232,12 +229,12 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
                 break;
         }   }
     
    -    return op-ostart;
    +    assert(op >= ostart);
    +    return (size_t)(op-ostart);
     }
     
     typedef struct {
         short ncount[FSE_MAX_SYMBOL_VALUE + 1];
    -    FSE_DTable dtable[1]; /* Dynamically sized */
     } FSE_DecompressWksp;
     
     
    @@ -252,13 +249,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
         unsigned tableLog;
         unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
         FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
    +    size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable);
    +    FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos;
     
    -    DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
    +    FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
         if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC);
     
    +    /* correct offset to dtable depends on this property */
    +    FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0);
    +
         /* normal FSE decoding mode */
    -    {
    -        size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
    +    {   size_t const NCountLength =
    +            FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
             if (FSE_isError(NCountLength)) return NCountLength;
             if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
             assert(NCountLength <= cSrcSize);
    @@ -271,16 +273,16 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
         workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
         wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
     
    -    CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
    +    CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
     
         {
    -        const void* ptr = wksp->dtable;
    +        const void* ptr = dtable;
             const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
             const U32 fastMode = DTableH->fastMode;
     
             /* select fast mode (static) */
    -        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1);
    -        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0);
    +        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1);
    +        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0);
         }
     }
     
    diff --git a/third-party/zstd/lib/common/huf.h b/third-party/zstd/lib/common/huf.h
    index 73d1ee56..99bf85d6 100644
    --- a/third-party/zstd/lib/common/huf.h
    +++ b/third-party/zstd/lib/common/huf.h
    @@ -197,9 +197,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
     
     /** HUF_getNbBitsFromCTable() :
      *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
    - *  Note 1 : is not inlined, as HUF_CElt definition is private */
    + *  Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0
    + *  Note 2 : is not inlined, as HUF_CElt definition is private
    + */
     U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
     
    +typedef struct {
    +    BYTE tableLog;
    +    BYTE maxSymbolValue;
    +    BYTE unused[sizeof(size_t) - 2];
    +} HUF_CTableHeader;
    +
    +/** HUF_readCTableHeader() :
    + *  @returns The header from the CTable specifying the tableLog and the maxSymbolValue.
    + */
    +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable);
    +
     /*
      * HUF_decompress() does the following:
      * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
    diff --git a/third-party/zstd/lib/common/mem.h b/third-party/zstd/lib/common/mem.h
    index 98dd47a0..096f4be5 100644
    --- a/third-party/zstd/lib/common/mem.h
    +++ b/third-party/zstd/lib/common/mem.h
    @@ -31,15 +31,6 @@ extern "C" {
     #   include   /* _byteswap_ulong */
     #   include   /* _byteswap_* */
     #endif
    -#if defined(__GNUC__)
    -#  define MEM_STATIC static __inline __attribute__((unused))
    -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
    -#  define MEM_STATIC static inline
    -#elif defined(_MSC_VER)
    -#  define MEM_STATIC static __inline
    -#else
    -#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
    -#endif
     
     /*-**************************************************************
     *  Basic Types
    diff --git a/third-party/zstd/lib/common/pool.c b/third-party/zstd/lib/common/pool.c
    index d5ca5a78..3adcefc9 100644
    --- a/third-party/zstd/lib/common/pool.c
    +++ b/third-party/zstd/lib/common/pool.c
    @@ -223,7 +223,7 @@ static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
         {   ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
             if (!threadPool) return 1;
             /* replace existing thread pool */
    -        ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
    +        ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(ZSTD_pthread_t));
             ZSTD_customFree(ctx->threads, ctx->customMem);
             ctx->threads = threadPool;
             /* Initialize additional threads */
    diff --git a/third-party/zstd/lib/common/pool.h b/third-party/zstd/lib/common/pool.h
    index eb22ff50..cca4de73 100644
    --- a/third-party/zstd/lib/common/pool.h
    +++ b/third-party/zstd/lib/common/pool.h
    @@ -47,7 +47,7 @@ void POOL_joinJobs(POOL_ctx* ctx);
     /*! POOL_resize() :
      *  Expands or shrinks pool's number of threads.
      *  This is more efficient than releasing + creating a new context,
    - *  since it tries to preserve and re-use existing threads.
    + *  since it tries to preserve and reuse existing threads.
      * `numThreads` must be at least 1.
      * @return : 0 when resize was successful,
      *           !0 (typically 1) if there is an error.
    diff --git a/third-party/zstd/lib/common/portability_macros.h b/third-party/zstd/lib/common/portability_macros.h
    index 8fd6ea82..e50314a7 100644
    --- a/third-party/zstd/lib/common/portability_macros.h
    +++ b/third-party/zstd/lib/common/portability_macros.h
    @@ -68,6 +68,8 @@
     /* Mark the internal assembly functions as hidden  */
     #ifdef __ELF__
     # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func
    +#elif defined(__APPLE__)
    +# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func
     #else
     # define ZSTD_HIDE_ASM_FUNCTION(func)
     #endif
    diff --git a/third-party/zstd/lib/common/threading.c b/third-party/zstd/lib/common/threading.c
    index ca155b9b..25bb8b98 100644
    --- a/third-party/zstd/lib/common/threading.c
    +++ b/third-party/zstd/lib/common/threading.c
    @@ -73,10 +73,12 @@ int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
         ZSTD_thread_params_t thread_param;
         (void)unused;
     
    +    if (thread==NULL) return -1;
    +    *thread = NULL;
    +
         thread_param.start_routine = start_routine;
         thread_param.arg = arg;
         thread_param.initialized = 0;
    -    *thread = NULL;
     
         /* Setup thread initialization synchronization */
         if(ZSTD_pthread_cond_init(&thread_param.initialized_cond, NULL)) {
    @@ -91,7 +93,7 @@ int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
     
         /* Spawn thread */
         *thread = (HANDLE)_beginthreadex(NULL, 0, worker, &thread_param, 0, NULL);
    -    if (!thread) {
    +    if (*thread==NULL) {
             ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex);
             ZSTD_pthread_cond_destroy(&thread_param.initialized_cond);
             return errno;
    @@ -137,6 +139,7 @@ int ZSTD_pthread_join(ZSTD_pthread_t thread)
     
     int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr)
     {
    +    assert(mutex != NULL);
         *mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t));
         if (!*mutex)
             return 1;
    @@ -145,6 +148,7 @@ int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t con
     
     int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex)
     {
    +    assert(mutex != NULL);
         if (!*mutex)
             return 0;
         {
    @@ -156,6 +160,7 @@ int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex)
     
     int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr)
     {
    +    assert(cond != NULL);
         *cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t));
         if (!*cond)
             return 1;
    @@ -164,6 +169,7 @@ int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const*
     
     int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond)
     {
    +    assert(cond != NULL);
         if (!*cond)
             return 0;
         {
    diff --git a/third-party/zstd/lib/common/xxhash.c b/third-party/zstd/lib/common/xxhash.c
    index fd237c90..052cd522 100644
    --- a/third-party/zstd/lib/common/xxhash.c
    +++ b/third-party/zstd/lib/common/xxhash.c
    @@ -1,24 +1,18 @@
     /*
    - *  xxHash - Fast Hash algorithm
    - *  Copyright (c) Meta Platforms, Inc. and affiliates.
    - *
    - *  You can contact the author at :
    - *  - xxHash homepage: https://cyan4973.github.io/xxHash/
    - *  - xxHash source repository : https://github.com/Cyan4973/xxHash
    + * xxHash - Extremely Fast Hash algorithm
    + * Copyright (c) Yann Collet - Meta Platforms, Inc
      *
      * This source code is licensed under both the BSD-style license (found in the
      * LICENSE file in the root directory of this source tree) and the GPLv2 (found
      * in the COPYING file in the root directory of this source tree).
      * You may select, at your option, one of the above-listed licenses.
    -*/
    -
    -
    + */
     
     /*
      * xxhash.c instantiates functions defined in xxhash.h
      */
     
    -#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
    -#define XXH_IMPLEMENTATION   /* access definitions */
    +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
    +#define XXH_IMPLEMENTATION      /* access definitions */
     
     #include "xxhash.h"
    diff --git a/third-party/zstd/lib/common/xxhash.h b/third-party/zstd/lib/common/xxhash.h
    index b8b73290..e59e4426 100644
    --- a/third-party/zstd/lib/common/xxhash.h
    +++ b/third-party/zstd/lib/common/xxhash.h
    @@ -1,17 +1,15 @@
     /*
    - *  xxHash - Fast Hash algorithm
    - *  Copyright (c) Meta Platforms, Inc. and affiliates.
    - *
    - *  You can contact the author at :
    - *  - xxHash homepage: https://cyan4973.github.io/xxHash/
    - *  - xxHash source repository : https://github.com/Cyan4973/xxHash
    + * xxHash - Extremely Fast Hash algorithm
    + * Header File
    + * Copyright (c) Yann Collet - Meta Platforms, Inc
      *
      * This source code is licensed under both the BSD-style license (found in the
      * LICENSE file in the root directory of this source tree) and the GPLv2 (found
      * in the COPYING file in the root directory of this source tree).
      * You may select, at your option, one of the above-listed licenses.
    -*/
    + */
     
    +/* Local adaptations for Zstandard */
     
     #ifndef XXH_NO_XXH3
     # define XXH_NO_XXH3
    @@ -24,46 +22,210 @@
     /*!
      * @mainpage xxHash
      *
    + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
    + * limits.
    + *
    + * It is proposed in four flavors, in three families:
    + * 1. @ref XXH32_family
    + *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
    + *     32-bit and 64-bit systems.
    + * 2. @ref XXH64_family
    + *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
    + *     64-bit systems (but _not_ 32-bit systems).
    + * 3. @ref XXH3_family
    + *   - Modern 64-bit and 128-bit hash function family which features improved
    + *     strength and performance across the board, especially on smaller data.
    + *     It benefits greatly from SIMD and 64-bit without requiring it.
    + *
    + * Benchmarks
    + * ---
    + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
    + * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
    + *
    + * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
    + * | -------------------- | ------- | ----: | ---------------: | ------------------: |
    + * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
    + * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
    + * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
    + * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
    + * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
    + * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
    + * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
    + * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
    + * | City64               |         |    64 |        22.0 GB/s |                76.6 |
    + * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
    + * | City128              |         |   128 |        21.7 GB/s |                57.7 |
    + * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
    + * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
    + * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
    + * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
    + * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
    + * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
    + * | City32               |         |    32 |         9.1 GB/s |                66.0 |
    + * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
    + * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
    + * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
    + * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
    + * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
    + * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
    + * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
    + * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
    + * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
    + * @note
    + *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
    + *     even though it is mandatory on x64.
    + *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
    + *     by modern standards.
    + *   - Small data velocity is a rough average of algorithm's efficiency for small
    + *     data. For more accurate information, see the wiki.
    + *   - More benchmarks and strength tests are found on the wiki:
    + *         https://github.com/Cyan4973/xxHash/wiki
    + *
    + * Usage
    + * ------
    + * All xxHash variants use a similar API. Changing the algorithm is a trivial
    + * substitution.
    + *
    + * @pre
    + *    For functions which take an input and length parameter, the following
    + *    requirements are assumed:
    + *    - The range from [`input`, `input + length`) is valid, readable memory.
    + *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
    + *    - For C++, the objects must have the *TriviallyCopyable* property, as the
    + *      functions access bytes directly as if it was an array of `unsigned char`.
    + *
    + * @anchor single_shot_example
    + * **Single Shot**
    + *
    + * These functions are stateless functions which hash a contiguous block of memory,
    + * immediately returning the result. They are the easiest and usually the fastest
    + * option.
    + *
    + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
    + *
    + * @code{.c}
    + *   #include 
    + *   #include "xxhash.h"
    + *
    + *   // Example for a function which hashes a null terminated string with XXH32().
    + *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
    + *   {
    + *       // NULL pointers are only valid if the length is zero
    + *       size_t length = (string == NULL) ? 0 : strlen(string);
    + *       return XXH32(string, length, seed);
    + *   }
    + * @endcode
    + *
    + *
    + * @anchor streaming_example
    + * **Streaming**
    + *
    + * These groups of functions allow incremental hashing of unknown size, even
    + * more than what would fit in a size_t.
    + *
    + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
    + *
    + * @code{.c}
    + *   #include 
    + *   #include 
    + *   #include "xxhash.h"
    + *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
    + *   XXH64_hash_t hashFile(FILE* f)
    + *   {
    + *       // Allocate a state struct. Do not just use malloc() or new.
    + *       XXH3_state_t* state = XXH3_createState();
    + *       assert(state != NULL && "Out of memory!");
    + *       // Reset the state to start a new hashing session.
    + *       XXH3_64bits_reset(state);
    + *       char buffer[4096];
    + *       size_t count;
    + *       // Read the file in chunks
    + *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
    + *           // Run update() as many times as necessary to process the data
    + *           XXH3_64bits_update(state, buffer, count);
    + *       }
    + *       // Retrieve the finalized hash. This will not change the state.
    + *       XXH64_hash_t result = XXH3_64bits_digest(state);
    + *       // Free the state. Do not use free().
    + *       XXH3_freeState(state);
    + *       return result;
    + *   }
    + * @endcode
    + *
    + * Streaming functions generate the xxHash value from an incremental input.
    + * This method is slower than single-call functions, due to state management.
    + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
    + *
    + * An XXH state must first be allocated using `XXH*_createState()`.
    + *
    + * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
    + *
    + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
    + *
    + * The function returns an error code, with 0 meaning OK, and any other value
    + * meaning there is an error.
    + *
    + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
    + * This function returns the nn-bits hash as an int or long long.
    + *
    + * It's still possible to continue inserting input into the hash state after a
    + * digest, and generate new hash values later on by invoking `XXH*_digest()`.
    + *
    + * When done, release the state using `XXH*_freeState()`.
    + *
    + *
    + * @anchor canonical_representation_example
    + * **Canonical Representation**
    + *
    + * The default return values from XXH functions are unsigned 32, 64 and 128 bit
    + * integers.
    + * This the simplest and fastest format for further post-processing.
    + *
    + * However, this leaves open the question of what is the order on the byte level,
    + * since little and big endian conventions will store the same number differently.
    + *
    + * The canonical representation settles this issue by mandating big-endian
    + * convention, the same convention as human-readable numbers (large digits first).
    + *
    + * When writing hash values to storage, sending them over a network, or printing
    + * them, it's highly recommended to use the canonical representation to ensure
    + * portability across a wider range of systems, present and future.
    + *
    + * The following functions allow transformation of hash values to and from
    + * canonical format.
    + *
    + * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(),
    + * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(),
    + * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(),
    + *
    + * @code{.c}
    + *   #include 
    + *   #include "xxhash.h"
    + *
    + *   // Example for a function which prints XXH32_hash_t in human readable format
    + *   void printXxh32(XXH32_hash_t hash)
    + *   {
    + *       XXH32_canonical_t cano;
    + *       XXH32_canonicalFromHash(&cano, hash);
    + *       size_t i;
    + *       for(i = 0; i < sizeof(cano.digest); ++i) {
    + *           printf("%02x", cano.digest[i]);
    + *       }
    + *       printf("\n");
    + *   }
    + *
    + *   // Example for a function which converts XXH32_canonical_t to XXH32_hash_t
    + *   XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano)
    + *   {
    + *       XXH32_hash_t hash = XXH32_hashFromCanonical(&cano);
    + *       return hash;
    + *   }
    + * @endcode
    + *
    + *
      * @file xxhash.h
      * xxHash prototypes and implementation
      */
    -/* TODO: update */
    -/* Notice extracted from xxHash homepage:
    -
    -xxHash is an extremely fast hash algorithm, running at RAM speed limits.
    -It also successfully passes all tests from the SMHasher suite.
    -
    -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
    -
    -Name            Speed       Q.Score   Author
    -xxHash          5.4 GB/s     10
    -CrapWow         3.2 GB/s      2       Andrew
    -MurmurHash 3a   2.7 GB/s     10       Austin Appleby
    -SpookyHash      2.0 GB/s     10       Bob Jenkins
    -SBox            1.4 GB/s      9       Bret Mulvey
    -Lookup3         1.2 GB/s      9       Bob Jenkins
    -SuperFastHash   1.2 GB/s      1       Paul Hsieh
    -CityHash64      1.05 GB/s    10       Pike & Alakuijala
    -FNV             0.55 GB/s     5       Fowler, Noll, Vo
    -CRC32           0.43 GB/s     9
    -MD5-32          0.33 GB/s    10       Ronald L. Rivest
    -SHA1-32         0.28 GB/s    10
    -
    -Q.Score is a measure of quality of the hash function.
    -It depends on successfully passing SMHasher test set.
    -10 is a perfect score.
    -
    -Note: SMHasher's CRC32 implementation is not the fastest one.
    -Other speed-oriented implementations can be faster,
    -especially in combination with PCLMUL instruction:
    -https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
    -
    -A 64-bit version, named XXH64, is available since r35.
    -It offers much better speed, but for 64-bit applications only.
    -Name     Speed on 64 bits    Speed on 32 bits
    -XXH64       13.8 GB/s            1.9 GB/s
    -XXH32        6.8 GB/s            6.0 GB/s
    -*/
     
     #if defined (__cplusplus)
     extern "C" {
    @@ -73,21 +235,80 @@ extern "C" {
      *  INLINE mode
      ******************************/
     /*!
    - * XXH_INLINE_ALL (and XXH_PRIVATE_API)
    + * @defgroup public Public API
    + * Contains details on the public xxHash functions.
    + * @{
    + */
    +#ifdef XXH_DOXYGEN
    +/*!
    + * @brief Gives access to internal state declaration, required for static allocation.
    + *
    + * Incompatible with dynamic linking, due to risks of ABI changes.
    + *
    + * Usage:
    + * @code{.c}
    + *     #define XXH_STATIC_LINKING_ONLY
    + *     #include "xxhash.h"
    + * @endcode
    + */
    +#  define XXH_STATIC_LINKING_ONLY
    +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */
    +
    +/*!
    + * @brief Gives access to internal definitions.
    + *
    + * Usage:
    + * @code{.c}
    + *     #define XXH_STATIC_LINKING_ONLY
    + *     #define XXH_IMPLEMENTATION
    + *     #include "xxhash.h"
    + * @endcode
    + */
    +#  define XXH_IMPLEMENTATION
    +/* Do not undef XXH_IMPLEMENTATION for Doxygen */
    +
    +/*!
    + * @brief Exposes the implementation and marks all functions as `inline`.
    + *
      * Use these build macros to inline xxhash into the target unit.
      * Inlining improves performance on small inputs, especially when the length is
      * expressed as a compile-time constant:
      *
    - *      https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
    + *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
      *
      * It also keeps xxHash symbols private to the unit, so they are not exported.
      *
      * Usage:
    + * @code{.c}
      *     #define XXH_INLINE_ALL
      *     #include "xxhash.h"
    - *
    + * @endcode
      * Do not compile and link xxhash.o as a separate object, as it is not useful.
      */
    +#  define XXH_INLINE_ALL
    +#  undef XXH_INLINE_ALL
    +/*!
    + * @brief Exposes the implementation without marking functions as inline.
    + */
    +#  define XXH_PRIVATE_API
    +#  undef XXH_PRIVATE_API
    +/*!
    + * @brief Emulate a namespace by transparently prefixing all symbols.
    + *
    + * If you want to include _and expose_ xxHash functions from within your own
    + * library, but also want to avoid symbol collisions with other libraries which
    + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
    + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
    + * (therefore, avoid empty or numeric values).
    + *
    + * Note that no change is required within the calling program as long as it
    + * includes `xxhash.h`: Regular symbol names will be automatically translated
    + * by this header.
    + */
    +#  define XXH_NAMESPACE /* YOUR NAME HERE */
    +#  undef XXH_NAMESPACE
    +#endif
    +
     #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
         && !defined(XXH_INLINE_ALL_31684351384)
        /* this section should be traversed only once */
    @@ -202,21 +423,13 @@ extern "C" {
     #  undef XXHASH_H_STATIC_13879238742
     #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
     
    -
    -
     /* ****************************************************************
      *  Stable API
      *****************************************************************/
     #ifndef XXHASH_H_5627135585666179
     #define XXHASH_H_5627135585666179 1
     
    -
    -/*!
    - * @defgroup public Public API
    - * Contains details on the public xxHash functions.
    - * @{
    - */
    -/* specific declaration modes for Windows */
    +/*! @brief Marks a global symbol. */
     #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
     #  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
     #    ifdef XXH_EXPORT
    @@ -229,24 +442,6 @@ extern "C" {
     #  endif
     #endif
     
    -#ifdef XXH_DOXYGEN
    -/*!
    - * @brief Emulate a namespace by transparently prefixing all symbols.
    - *
    - * If you want to include _and expose_ xxHash functions from within your own
    - * library, but also want to avoid symbol collisions with other libraries which
    - * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
    - * any public symbol from xxhash library with the value of XXH_NAMESPACE
    - * (therefore, avoid empty or numeric values).
    - *
    - * Note that no change is required within the calling program as long as it
    - * includes `xxhash.h`: Regular symbol names will be automatically translated
    - * by this header.
    - */
    -#  define XXH_NAMESPACE /* YOUR NAME HERE */
    -#  undef XXH_NAMESPACE
    -#endif
    -
     #ifdef XXH_NAMESPACE
     #  define XXH_CAT(A,B) A##B
     #  define XXH_NAME2(A,B) XXH_CAT(A,B)
    @@ -306,12 +501,40 @@ extern "C" {
     #endif
     
     
    +/* *************************************
    +*  Compiler specifics
    +***************************************/
    +
    +/* specific declaration modes for Windows */
    +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
    +#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
    +#    ifdef XXH_EXPORT
    +#      define XXH_PUBLIC_API __declspec(dllexport)
    +#    elif XXH_IMPORT
    +#      define XXH_PUBLIC_API __declspec(dllimport)
    +#    endif
    +#  else
    +#    define XXH_PUBLIC_API   /* do nothing */
    +#  endif
    +#endif
    +
    +#if defined (__GNUC__)
    +# define XXH_CONSTF  __attribute__((const))
    +# define XXH_PUREF   __attribute__((pure))
    +# define XXH_MALLOCF __attribute__((malloc))
    +#else
    +# define XXH_CONSTF  /* disable */
    +# define XXH_PUREF
    +# define XXH_MALLOCF
    +#endif
    +
     /* *************************************
     *  Version
     ***************************************/
     #define XXH_VERSION_MAJOR    0
     #define XXH_VERSION_MINOR    8
    -#define XXH_VERSION_RELEASE  1
    +#define XXH_VERSION_RELEASE  2
    +/*! @brief Version number, encoded as two digits each */
     #define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
     
     /*!
    @@ -320,16 +543,22 @@ extern "C" {
      * This is mostly useful when xxHash is compiled as a shared library,
      * since the returned value comes from the library, as opposed to header file.
      *
    - * @return `XXH_VERSION_NUMBER` of the invoked library.
    + * @return @ref XXH_VERSION_NUMBER of the invoked library.
      */
    -XXH_PUBLIC_API unsigned XXH_versionNumber (void);
    +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
     
     
     /* ****************************
     *  Common basic types
     ******************************/
     #include    /* size_t */
    -typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
    +/*!
    + * @brief Exit code for the streaming API.
    + */
    +typedef enum {
    +    XXH_OK = 0, /*!< OK */
    +    XXH_ERROR   /*!< Error */
    +} XXH_errorcode;
     
     
     /*-**********************************************************************
    @@ -346,44 +575,44 @@ typedef uint32_t XXH32_hash_t;
     #elif !defined (__VMS) \
       && (defined (__cplusplus) \
       || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
    -#   include 
    +#   ifdef _AIX
    +#     include 
    +#   else
    +#     include 
    +#   endif
         typedef uint32_t XXH32_hash_t;
     
     #else
     #   include 
     #   if UINT_MAX == 0xFFFFFFFFUL
           typedef unsigned int XXH32_hash_t;
    +#   elif ULONG_MAX == 0xFFFFFFFFUL
    +      typedef unsigned long XXH32_hash_t;
     #   else
    -#     if ULONG_MAX == 0xFFFFFFFFUL
    -        typedef unsigned long XXH32_hash_t;
    -#     else
    -#       error "unsupported platform: need a 32-bit type"
    -#     endif
    +#     error "unsupported platform: need a 32-bit type"
     #   endif
     #endif
     
     /*!
      * @}
      *
    - * @defgroup xxh32_family XXH32 family
    + * @defgroup XXH32_family XXH32 family
      * @ingroup public
      * Contains functions used in the classic 32-bit xxHash algorithm.
      *
      * @note
      *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
    - *   Note that @ref xxh3_family provides competitive speed
    - *   for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
    + *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
    + *   and 64-bit systems, and offers true 64/128 bit hash results.
      *
    - * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
    - * @see @ref xxh32_impl for implementation details
    + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
    + * @see @ref XXH32_impl for implementation details
      * @{
      */
     
     /*!
      * @brief Calculates the 32-bit hash of @p input using xxHash32.
      *
    - * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
    - *
      * @param input The block of data to be hashed, at least @p length bytes in size.
      * @param length The length of @p input, in bytes.
      * @param seed The 32-bit seed to alter the hash's output predictably.
    @@ -393,66 +622,13 @@ typedef uint32_t XXH32_hash_t;
      *   readable, contiguous memory. However, if @p length is `0`, @p input may be
      *   `NULL`. In C++, this also must be *TriviallyCopyable*.
      *
    - * @return The calculated 32-bit hash value.
    + * @return The calculated 32-bit xxHash32 value.
      *
    - * @see
    - *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
    - *    Direct equivalents for the other variants of xxHash.
    - * @see
    - *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
    - */
    -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
    -
    -/*!
    - * Streaming functions generate the xxHash value from an incremental input.
    - * This method is slower than single-call functions, due to state management.
    - * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
    - *
    - * An XXH state must first be allocated using `XXH*_createState()`.
    - *
    - * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
    - *
    - * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
    - *
    - * The function returns an error code, with 0 meaning OK, and any other value
    - * meaning there is an error.
    - *
    - * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
    - * This function returns the nn-bits hash as an int or long long.
    - *
    - * It's still possible to continue inserting input into the hash state after a
    - * digest, and generate new hash values later on by invoking `XXH*_digest()`.
    - *
    - * When done, release the state using `XXH*_freeState()`.
    - *
    - * Example code for incrementally hashing a file:
    - * @code{.c}
    - *    #include 
    - *    #include 
    - *    #define BUFFER_SIZE 256
    - *
    - *    // Note: XXH64 and XXH3 use the same interface.
    - *    XXH32_hash_t
    - *    hashFile(FILE* stream)
    - *    {
    - *        XXH32_state_t* state;
    - *        unsigned char buf[BUFFER_SIZE];
    - *        size_t amt;
    - *        XXH32_hash_t hash;
    - *
    - *        state = XXH32_createState();       // Create a state
    - *        assert(state != NULL);             // Error check here
    - *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
    - *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
    - *            XXH32_update(state, buf, amt); // Hash the file in chunks
    - *        }
    - *        hash = XXH32_digest(state);        // Finalize the hash
    - *        XXH32_freeState(state);            // Clean up
    - *        return hash;
    - *    }
    - * @endcode
    + * @see @ref single_shot_example "Single Shot Example" for an example.
      */
    +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
     
    +#ifndef XXH_NO_STREAM
     /*!
      * @typedef struct XXH32_state_s XXH32_state_t
      * @brief The opaque state struct for the XXH32 streaming API.
    @@ -464,16 +640,21 @@ typedef struct XXH32_state_s XXH32_state_t;
     /*!
      * @brief Allocates an @ref XXH32_state_t.
      *
    - * Must be freed with XXH32_freeState().
    - * @return An allocated XXH32_state_t on success, `NULL` on failure.
    + * @return An allocated pointer of @ref XXH32_state_t on success.
    + * @return `NULL` on failure.
    + *
    + * @note Must be freed with XXH32_freeState().
      */
    -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
    +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
     /*!
      * @brief Frees an @ref XXH32_state_t.
      *
    - * Must be allocated with XXH32_createState().
      * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
    - * @return XXH_OK.
    + *
    + * @return @ref XXH_OK.
    + *
    + * @note @p statePtr must be allocated with XXH32_createState().
    + *
      */
     XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
     /*!
    @@ -489,23 +670,22 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_
     /*!
      * @brief Resets an @ref XXH32_state_t to begin a new hash.
      *
    - * This function resets and seeds a state. Call it before @ref XXH32_update().
    - *
      * @param statePtr The state struct to reset.
      * @param seed The 32-bit seed to alter the hash result predictably.
      *
      * @pre
      *   @p statePtr must not be `NULL`.
      *
    - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note This function resets and seeds a state. Call it before @ref XXH32_update().
      */
     XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
     
     /*!
      * @brief Consumes a block of @p input to an @ref XXH32_state_t.
      *
    - * Call this to incrementally consume blocks of data.
    - *
      * @param statePtr The state struct to update.
      * @param input The block of data to be hashed, at least @p length bytes in size.
      * @param length The length of @p input, in bytes.
    @@ -517,47 +697,32 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t
      *   readable, contiguous memory. However, if @p length is `0`, @p input may be
      *   `NULL`. In C++, this also must be *TriviallyCopyable*.
      *
    - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note Call this to incrementally consume blocks of data.
      */
     XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
     
     /*!
      * @brief Returns the calculated hash value from an @ref XXH32_state_t.
      *
    - * @note
    - *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
    - *   digest, and update again.
    - *
      * @param statePtr The state struct to calculate the hash from.
      *
      * @pre
      *  @p statePtr must not be `NULL`.
      *
    - * @return The calculated xxHash32 value from that state.
    + * @return The calculated 32-bit xxHash32 value from that state.
    + *
    + * @note
    + *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
    + *   digest, and update again.
      */
    -XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
    +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
    +#endif /* !XXH_NO_STREAM */
     
     /*******   Canonical representation   *******/
     
    -/*
    - * The default return values from XXH functions are unsigned 32 and 64 bit
    - * integers.
    - * This the simplest and fastest format for further post-processing.
    - *
    - * However, this leaves open the question of what is the order on the byte level,
    - * since little and big endian conventions will store the same number differently.
    - *
    - * The canonical representation settles this issue by mandating big-endian
    - * convention, the same convention as human-readable numbers (large digits first).
    - *
    - * When writing hash values to storage, sending them over a network, or printing
    - * them, it's highly recommended to use the canonical representation to ensure
    - * portability across a wider range of systems, present and future.
    - *
    - * The following functions allow transformation of hash values to and from
    - * canonical format.
    - */
    -
     /*!
      * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
      */
    @@ -568,11 +733,13 @@ typedef struct {
     /*!
      * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
      *
    - * @param dst The @ref XXH32_canonical_t pointer to be stored to.
    + * @param dst  The @ref XXH32_canonical_t pointer to be stored to.
      * @param hash The @ref XXH32_hash_t to be converted.
      *
      * @pre
      *   @p dst must not be `NULL`.
    + *
    + * @see @ref canonical_representation_example "Canonical Representation Example"
      */
     XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
     
    @@ -585,44 +752,75 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
      *   @p src must not be `NULL`.
      *
      * @return The converted hash.
    + *
    + * @see @ref canonical_representation_example "Canonical Representation Example"
      */
    -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
    +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
     
     
    +/*! @cond Doxygen ignores this part */
     #ifdef __has_attribute
     # define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
     #else
     # define XXH_HAS_ATTRIBUTE(x) 0
     #endif
    +/*! @endcond */
    +
    +/*! @cond Doxygen ignores this part */
    +/*
    + * C23 __STDC_VERSION__ number hasn't been specified yet. For now
    + * leave as `201711L` (C17 + 1).
     + * TODO: Update to correct value when it's been specified.
    + */
    +#define XXH_C23_VN 201711L
    +/*! @endcond */
     
    +/*! @cond Doxygen ignores this part */
     /* C-language Attributes are added in C23. */
    -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
    +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
     # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
     #else
     # define XXH_HAS_C_ATTRIBUTE(x) 0
     #endif
    +/*! @endcond */
     
    +/*! @cond Doxygen ignores this part */
     #if defined(__cplusplus) && defined(__has_cpp_attribute)
     # define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
     #else
     # define XXH_HAS_CPP_ATTRIBUTE(x) 0
     #endif
    +/*! @endcond */
     
    +/*! @cond Doxygen ignores this part */
     /*
    -Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
    -introduced in CPP17 and C23.
    -CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
    -C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
    -*/
    -#if XXH_HAS_C_ATTRIBUTE(x)
    -# define XXH_FALLTHROUGH [[fallthrough]]
    -#elif XXH_HAS_CPP_ATTRIBUTE(x)
    + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
    + * introduced in CPP17 and C23.
    + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
    + * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
    + */
    +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
     # define XXH_FALLTHROUGH [[fallthrough]]
     #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
    -# define XXH_FALLTHROUGH __attribute__ ((fallthrough))
    +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
    +#else
    +# define XXH_FALLTHROUGH /* fallthrough */
    +#endif
    +/*! @endcond */
    +
    +/*! @cond Doxygen ignores this part */
    +/*
    + * Define XXH_NOESCAPE for annotated pointers in public API.
    + * https://clang.llvm.org/docs/AttributeReference.html#noescape
    + * As of writing this, only supported by clang.
    + */
    +#if XXH_HAS_ATTRIBUTE(noescape)
    +# define XXH_NOESCAPE __attribute__((noescape))
     #else
    -# define XXH_FALLTHROUGH
    +# define XXH_NOESCAPE
     #endif
    +/*! @endcond */
    +
     
     /*!
      * @}
    @@ -644,7 +842,11 @@ typedef uint64_t XXH64_hash_t;
     #elif !defined (__VMS) \
       && (defined (__cplusplus) \
       || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
    -#  include 
    +#   ifdef _AIX
    +#     include 
    +#   else
    +#     include 
    +#   endif
        typedef uint64_t XXH64_hash_t;
     #else
     #  include 
    @@ -660,7 +862,7 @@ typedef uint64_t XXH64_hash_t;
     /*!
      * @}
      *
    - * @defgroup xxh64_family XXH64 family
    + * @defgroup XXH64_family XXH64 family
      * @ingroup public
      * @{
      * Contains functions used in the classic 64-bit xxHash algorithm.
    @@ -671,13 +873,9 @@ typedef uint64_t XXH64_hash_t;
      *   It provides better speed for systems with vector processing capabilities.
      */
     
    -
     /*!
      * @brief Calculates the 64-bit hash of @p input using xxHash64.
      *
    - * This function usually runs faster on 64-bit systems, but slower on 32-bit
    - * systems (see benchmark).
    - *
      * @param input The block of data to be hashed, at least @p length bytes in size.
      * @param length The length of @p input, in bytes.
      * @param seed The 64-bit seed to alter the hash's output predictably.
    @@ -687,41 +885,145 @@ typedef uint64_t XXH64_hash_t;
      *   readable, contiguous memory. However, if @p length is `0`, @p input may be
      *   `NULL`. In C++, this also must be *TriviallyCopyable*.
      *
    - * @return The calculated 64-bit hash.
    + * @return The calculated 64-bit xxHash64 value.
      *
    - * @see
    - *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
    - *    Direct equivalents for the other variants of xxHash.
    - * @see
    - *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
    + * @see @ref single_shot_example "Single Shot Example" for an example.
      */
    -XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
     
     /*******   Streaming   *******/
    +#ifndef XXH_NO_STREAM
     /*!
      * @brief The opaque state struct for the XXH64 streaming API.
      *
      * @see XXH64_state_s for details.
      */
     typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
    -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
    -XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
    -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
     
    -XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
    -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
    -XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
    -
    -/*******   Canonical representation   *******/
    -typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
    -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
    -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
    +/*!
    + * @brief Allocates an @ref XXH64_state_t.
    + *
    + * @return An allocated pointer of @ref XXH64_state_t on success.
    + * @return `NULL` on failure.
    + *
    + * @note Must be freed with XXH64_freeState().
    + */
    +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
    +
    +/*!
    + * @brief Frees an @ref XXH64_state_t.
    + *
    + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().
    + *
    + * @return @ref XXH_OK.
    + *
    + * @note @p statePtr must be allocated with XXH64_createState().
    + */
    +XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
    +
    +/*!
    + * @brief Copies one @ref XXH64_state_t to another.
    + *
    + * @param dst_state The state to copy to.
    + * @param src_state The state to copy from.
    + * @pre
    + *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
    + */
    +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
    +
    +/*!
    + * @brief Resets an @ref XXH64_state_t to begin a new hash.
    + *
    + * @param statePtr The state struct to reset.
    + * @param seed The 64-bit seed to alter the hash result predictably.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note This function resets and seeds a state. Call it before @ref XXH64_update().
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
    +
    +/*!
    + * @brief Consumes a block of @p input to an @ref XXH64_state_t.
    + *
    + * @param statePtr The state struct to update.
    + * @param input The block of data to be hashed, at least @p length bytes in size.
    + * @param length The length of @p input, in bytes.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + * @pre
    + *   The memory between @p input and @p input + @p length must be valid,
    + *   readable, contiguous memory. However, if @p length is `0`, @p input may be
    + *   `NULL`. In C++, this also must be *TriviallyCopyable*.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note Call this to incrementally consume blocks of data.
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
    +
    +/*!
    + * @brief Returns the calculated hash value from an @ref XXH64_state_t.
    + *
    + * @param statePtr The state struct to calculate the hash from.
    + *
    + * @pre
    + *  @p statePtr must not be `NULL`.
    + *
    + * @return The calculated 64-bit xxHash64 value from that state.
    + *
    + * @note
    + *   Calling XXH64_digest() will not affect @p statePtr, so you can update,
    + *   digest, and update again.
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
    +#endif /* !XXH_NO_STREAM */
    +/*******   Canonical representation   *******/
    +
    +/*!
    + * @brief Canonical (big endian) representation of @ref XXH64_hash_t.
    + */
    +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
    +
    +/*!
    + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.
    + *
    + * @param dst The @ref XXH64_canonical_t pointer to be stored to.
    + * @param hash The @ref XXH64_hash_t to be converted.
    + *
    + * @pre
    + *   @p dst must not be `NULL`.
    + *
    + * @see @ref canonical_representation_example "Canonical Representation Example"
    + */
    +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
    +
    +/*!
    + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.
    + *
    + * @param src The @ref XXH64_canonical_t to convert.
    + *
    + * @pre
    + *   @p src must not be `NULL`.
    + *
    + * @return The converted hash.
    + *
    + * @see @ref canonical_representation_example "Canonical Representation Example"
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
    +
    +#ifndef XXH_NO_XXH3
     
    -#ifndef XXH_NO_XXH3
     /*!
      * @}
      * ************************************************************************
    - * @defgroup xxh3_family XXH3 family
    + * @defgroup XXH3_family XXH3 family
      * @ingroup public
      * @{
      *
    @@ -741,16 +1043,26 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
      *
      * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
      * but does not require it.
    - * Any 32-bit and 64-bit targets that can run XXH32 smoothly
    - * can run XXH3 at competitive speeds, even without vector support.
    - * Further details are explained in the implementation.
    - *
    - * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
    - * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
    + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
    + * at competitive speeds, even without vector support. Further details are
    + * explained in the implementation.
    + *
    + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD
    + * implementations for many common platforms:
    + *   - AVX512
    + *   - AVX2
    + *   - SSE2
    + *   - ARM NEON
    + *   - WebAssembly SIMD128
    + *   - POWER8 VSX
    + *   - s390x ZVector
    + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
    + * selects the best version according to predefined macros. For the x86 family, an
    + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
      *
      * XXH3 implementation is portable:
      * it has a generic C90 formulation that can be compiled on any platform,
    - * all implementations generage exactly the same hash value on all platforms.
    + * all implementations generate exactly the same hash value on all platforms.
      * Starting from v0.8.0, it's also labelled "stable", meaning that
      * any future version will also generate the same hash value.
      *
    @@ -762,24 +1074,59 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
      *
      * The API supports one-shot hashing, streaming mode, and custom secrets.
      */
    -
     /*-**********************************************************************
     *  XXH3 64-bit variant
     ************************************************************************/
     
    -/* XXH3_64bits():
    - * default 64-bit variant, using default secret and default seed of 0.
    - * It's the fastest variant. */
    -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
    +/*!
    + * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input.
    + *
    + * @param input  The block of data to be hashed, at least @p length bytes in size.
    + * @param length The length of @p input, in bytes.
    + *
    + * @pre
    + *   The memory between @p input and @p input + @p length must be valid,
    + *   readable, contiguous memory. However, if @p length is `0`, @p input may be
    + *   `NULL`. In C++, this also must be *TriviallyCopyable*.
    + *
    + * @return The calculated 64-bit XXH3 hash value.
    + *
    + * @note
    + *   This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however
    + *   it may have slightly better performance due to constant propagation of the
    + *   defaults.
    + *
    + * @see
    + *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
    + * @see @ref single_shot_example "Single Shot Example" for an example.
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
     
    -/*
    - * XXH3_64bits_withSeed():
    - * This variant generates a custom secret on the fly
    - * based on default secret altered using the `seed` value.
    +/*!
    + * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input.
    + *
    + * @param input  The block of data to be hashed, at least @p length bytes in size.
    + * @param length The length of @p input, in bytes.
    + * @param seed   The 64-bit seed to alter the hash result predictably.
    + *
    + * @pre
    + *   The memory between @p input and @p input + @p length must be valid,
    + *   readable, contiguous memory. However, if @p length is `0`, @p input may be
    + *   `NULL`. In C++, this also must be *TriviallyCopyable*.
    + *
    + * @return The calculated 64-bit XXH3 hash value.
    + *
    + * @note
    + *    seed == 0 produces the same results as @ref XXH3_64bits().
    + *
    + * This variant generates a custom secret on the fly based on default secret
    + * altered using the @p seed value.
    + *
      * While this operation is decently fast, note that it's not completely free.
    - * Note: seed==0 produces the same results as XXH3_64bits().
    + *
    + * @see @ref single_shot_example "Single Shot Example" for an example.
      */
    -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
     
     /*!
      * The bare minimum size for a custom secret.
    @@ -790,27 +1137,43 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, X
      */
     #define XXH3_SECRET_SIZE_MIN 136
     
    -/*
    - * XXH3_64bits_withSecret():
    +/*!
    + * @brief Calculates 64-bit variant of XXH3 with a custom "secret".
    + *
    + * @param data       The block of data to be hashed, at least @p len bytes in size.
    + * @param len        The length of @p data, in bytes.
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + *
    + * @return The calculated 64-bit XXH3 hash value.
    + *
    + * @pre
    + *   The memory between @p data and @p data + @p len must be valid,
     + *   readable, contiguous memory. However, if @p len is `0`, @p data may be
    + *   `NULL`. In C++, this also must be *TriviallyCopyable*.
    + *
      * It's possible to provide any blob of bytes as a "secret" to generate the hash.
      * This makes it more difficult for an external actor to prepare an intentional collision.
    - * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
    + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
      * However, the quality of the secret impacts the dispersion of the hash algorithm.
      * Therefore, the secret _must_ look like a bunch of random bytes.
      * Avoid "trivial" or structured data such as repeated sequences or a text document.
      * Whenever in doubt about the "randomness" of the blob of bytes,
    - * consider employing "XXH3_generateSecret()" instead (see below).
    + * consider employing @ref XXH3_generateSecret() instead (see below).
      * It will generate a proper high entropy secret derived from the blob of bytes.
      * Another advantage of using XXH3_generateSecret() is that
      * it guarantees that all bits within the initial blob of bytes
      * will impact every bit of the output.
      * This is not necessarily the case when using the blob of bytes directly
      * because, when hashing _small_ inputs, only a portion of the secret is employed.
    + *
    + * @see @ref single_shot_example "Single Shot Example" for an example.
      */
    -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
     
     
     /*******   Streaming   *******/
    +#ifndef XXH_NO_STREAM
     /*
      * Streaming requires state maintenance.
      * This operation costs memory and CPU.
    @@ -819,40 +1182,124 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len,
      */
     
     /*!
    - * @brief The state struct for the XXH3 streaming API.
    + * @brief The opaque state struct for the XXH3 streaming API.
      *
      * @see XXH3_state_s for details.
      */
     typedef struct XXH3_state_s XXH3_state_t;
    -XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
    +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
     XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
    -XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
     
    -/*
    - * XXH3_64bits_reset():
    - * Initialize with default parameters.
    - * digest will be equivalent to `XXH3_64bits()`.
    +/*!
    + * @brief Copies one @ref XXH3_state_t to another.
    + *
    + * @param dst_state The state to copy to.
    + * @param src_state The state to copy from.
    + * @pre
    + *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
      */
    -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
    -/*
    - * XXH3_64bits_reset_withSeed():
    - * Generate a custom secret from `seed`, and store it into `statePtr`.
    - * digest will be equivalent to `XXH3_64bits_withSeed()`.
    +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
    +
    +/*!
    + * @brief Resets an @ref XXH3_state_t to begin a new hash.
    + *
    + * @param statePtr The state struct to reset.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note
     + *   - This function resets `statePtr` and generates a secret with default parameters.
    + *   - Call this function before @ref XXH3_64bits_update().
    + *   - Digest will be equivalent to `XXH3_64bits()`.
    + *
      */
    -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
    -/*
    - * XXH3_64bits_reset_withSecret():
    - * `secret` is referenced, it _must outlive_ the hash streaming session.
    - * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
    +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
    +
    +/*!
    + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
    + *
    + * @param statePtr The state struct to reset.
    + * @param seed     The 64-bit seed to alter the hash result predictably.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note
    + *   - This function resets `statePtr` and generates a secret from `seed`.
    + *   - Call this function before @ref XXH3_64bits_update().
    + *   - Digest will be equivalent to `XXH3_64bits_withSeed()`.
    + *
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
    +
    +/*!
    + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
    + *
    + * @param statePtr The state struct to reset.
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note
    + *   `secret` is referenced, it _must outlive_ the hash streaming session.
    + *
    + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
      * and the quality of produced hash values depends on secret's entropy
      * (secret's content should look like a bunch of random bytes).
      * When in doubt about the randomness of a candidate `secret`,
      * consider employing `XXH3_generateSecret()` instead (see below).
      */
    -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
    +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
     
    -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
    -XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
    +/*!
    + * @brief Consumes a block of @p input to an @ref XXH3_state_t.
    + *
    + * @param statePtr The state struct to update.
    + * @param input The block of data to be hashed, at least @p length bytes in size.
    + * @param length The length of @p input, in bytes.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + * @pre
    + *   The memory between @p input and @p input + @p length must be valid,
    + *   readable, contiguous memory. However, if @p length is `0`, @p input may be
    + *   `NULL`. In C++, this also must be *TriviallyCopyable*.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note Call this to incrementally consume blocks of data.
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
    +
    +/*!
    + * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
    + *
    + * @param statePtr The state struct to calculate the hash from.
    + *
    + * @pre
    + *  @p statePtr must not be `NULL`.
    + *
    + * @return The calculated XXH3 64-bit hash value from that state.
    + *
    + * @note
    + *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
    + *   digest, and update again.
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
    +#endif /* !XXH_NO_STREAM */
     
     /* note : canonical representation of XXH3 is the same as XXH64
      * since they both produce XXH64_hash_t values */
    @@ -873,11 +1320,76 @@ typedef struct {
         XXH64_hash_t high64;  /*!< `value >> 64` */
     } XXH128_hash_t;
     
    -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
    -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
    -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
    +/*!
    + * @brief Calculates 128-bit unseeded variant of XXH3 of @p data.
    + *
    + * @param data The block of data to be hashed, at least @p len bytes in size.
    + * @param len  The length of @p data, in bytes.
    + *
    + * @return The calculated 128-bit variant of XXH3 value.
    + *
    + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
    + * for shorter inputs.
    + *
    + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however
    + * it may have slightly better performance due to constant propagation of the
    + * defaults.
    + *
    + * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
    + * @see @ref single_shot_example "Single Shot Example" for an example.
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
    +/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
    + *
    + * @param data The block of data to be hashed, at least @p len bytes in size.
    + * @param len  The length of @p data, in bytes.
    + * @param seed The 64-bit seed to alter the hash result predictably.
    + *
    + * @return The calculated 128-bit variant of XXH3 value.
    + *
    + * @note
    + *    seed == 0 produces the same results as @ref XXH3_128bits().
    + *
    + * This variant generates a custom secret on the fly based on default secret
    + * altered using the @p seed value.
    + *
    + * While this operation is decently fast, note that it's not completely free.
    + *
    + * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants
    + * @see @ref single_shot_example "Single Shot Example" for an example.
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
    +/*!
    + * @brief Calculates 128-bit variant of XXH3 with a custom "secret".
    + *
    + * @param data       The block of data to be hashed, at least @p len bytes in size.
    + * @param len        The length of @p data, in bytes.
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + *
    + * @return The calculated 128-bit variant of XXH3 value.
    + *
    + * It's possible to provide any blob of bytes as a "secret" to generate the hash.
    + * This makes it more difficult for an external actor to prepare an intentional collision.
    + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
    + * However, the quality of the secret impacts the dispersion of the hash algorithm.
    + * Therefore, the secret _must_ look like a bunch of random bytes.
    + * Avoid "trivial" or structured data such as repeated sequences or a text document.
    + * Whenever in doubt about the "randomness" of the blob of bytes,
    + * consider employing @ref XXH3_generateSecret() instead (see below).
    + * It will generate a proper high entropy secret derived from the blob of bytes.
    + * Another advantage of using XXH3_generateSecret() is that
    + * it guarantees that all bits within the initial blob of bytes
    + * will impact every bit of the output.
    + * This is not necessarily the case when using the blob of bytes directly
    + * because, when hashing _small_ inputs, only a portion of the secret is employed.
    + *
    + * @see @ref single_shot_example "Single Shot Example" for an example.
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
     
     /*******   Streaming   *******/
    +#ifndef XXH_NO_STREAM
     /*
      * Streaming requires state maintenance.
      * This operation costs memory and CPU.
    @@ -887,42 +1399,166 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t le
      * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
      * Use already declared XXH3_createState() and XXH3_freeState().
      *
    - * All reset and streaming functions have same meaning as their 64-bit counterpart.
    + * All reset and streaming functions have same meaning as their 64-bit counterpart.
    + */
    +
    +/*!
    + * @brief Resets an @ref XXH3_state_t to begin a new hash.
    + *
    + * @param statePtr The state struct to reset.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note
    + *   - This function resets `statePtr` and generates a secret with default parameters.
    + *   - Call it before @ref XXH3_128bits_update().
    + *   - Digest will be equivalent to `XXH3_128bits()`.
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
    +
    +/*!
    + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
    + *
    + * @param statePtr The state struct to reset.
    + * @param seed     The 64-bit seed to alter the hash result predictably.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note
    + *   - This function resets `statePtr` and generates a secret from `seed`.
    + *   - Call it before @ref XXH3_128bits_update().
    + *   - Digest will be equivalent to `XXH3_128bits_withSeed()`.
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
    +/*!
    + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
    + *
    + * @param statePtr   The state struct to reset.
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * `secret` is referenced, it _must outlive_ the hash streaming session.
    + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
    + * and the quality of produced hash values depends on secret's entropy
    + * (secret's content should look like a bunch of random bytes).
    + * When in doubt about the randomness of a candidate `secret`,
    + * consider employing `XXH3_generateSecret()` instead (see below).
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
    +
    +/*!
    + * @brief Consumes a block of @p input to an @ref XXH3_state_t.
    + *
    + * Call this to incrementally consume blocks of data.
    + *
    + * @param statePtr The state struct to update.
    + * @param input The block of data to be hashed, at least @p length bytes in size.
    + * @param length The length of @p input, in bytes.
    + *
    + * @pre
    + *   @p statePtr must not be `NULL`.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @note
    + *   The memory between @p input and @p input + @p length must be valid,
    + *   readable, contiguous memory. However, if @p length is `0`, @p input may be
    + *   `NULL`. In C++, this also must be *TriviallyCopyable*.
    + *
    + */
    +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
    +
    +/*!
    + * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
    + *
    + * @param statePtr The state struct to calculate the hash from.
    + *
    + * @pre
    + *  @p statePtr must not be `NULL`.
    + *
    + * @return The calculated XXH3 128-bit hash value from that state.
    + *
    + * @note
    + *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
    + *   digest, and update again.
    + *
      */
    -
    -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
    -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
    -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
    -
    -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
    -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
    +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
    +#endif /* !XXH_NO_STREAM */
     
     /* Following helper functions make it possible to compare XXH128_hast_t values.
      * Since XXH128_hash_t is a structure, this capability is not offered by the language.
      * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
     
     /*!
    - * XXH128_isEqual():
    - * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
    + * @brief Check equality of two XXH128_hash_t values
    + *
    + * @param h1 The 128-bit hash value.
    + * @param h2 Another 128-bit hash value.
    + *
    + * @return `1` if `h1` and `h2` are equal.
    + * @return `0` if they are not.
      */
    -XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
    +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
     
     /*!
    - * XXH128_cmp():
    + * @brief Compares two @ref XXH128_hash_t
      *
      * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
      *
    - * return: >0 if *h128_1  > *h128_2
    - *         =0 if *h128_1 == *h128_2
    - *         <0 if *h128_1  < *h128_2
    + * @param h128_1 Left-hand side value
    + * @param h128_2 Right-hand side value
    + *
    + * @return >0 if @p h128_1  > @p h128_2
    + * @return =0 if @p h128_1 == @p h128_2
    + * @return <0 if @p h128_1  < @p h128_2
      */
    -XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
    +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
     
     
     /*******   Canonical representation   *******/
     typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
    -XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
    -XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
    +
    +
    +/*!
    + * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
    + *
    + * @param dst  The @ref XXH128_canonical_t pointer to be stored to.
    + * @param hash The @ref XXH128_hash_t to be converted.
    + *
    + * @pre
    + *   @p dst must not be `NULL`.
    + * @see @ref canonical_representation_example "Canonical Representation Example"
    + */
    +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
    +
    +/*!
    + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
    + *
    + * @param src The @ref XXH128_canonical_t to convert.
    + *
    + * @pre
    + *   @p src must not be `NULL`.
    + *
    + * @return The converted hash.
    + * @see @ref canonical_representation_example "Canonical Representation Example"
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
     
     
     #endif  /* !XXH_NO_XXH3 */
    @@ -996,7 +1632,6 @@ struct XXH64_state_s {
        XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
     };   /* typedef'd to XXH64_state_t */
     
    -
     #ifndef XXH_NO_XXH3
     
     #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
    @@ -1032,6 +1667,7 @@ struct XXH64_state_s {
     #define XXH3_INTERNALBUFFER_SIZE 256
     
     /*!
    + * @internal
      * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
      *
      * This is the size used in @ref XXH3_kSecret and the seeded functions.
    @@ -1064,7 +1700,7 @@ struct XXH64_state_s {
      */
     struct XXH3_state_s {
        XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
    -       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
    +       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
        XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
            /*!< Used to store a custom secret generated from a seed. */
        XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
    @@ -1104,69 +1740,148 @@ struct XXH3_state_s {
      * Note that this doesn't prepare the state for a streaming operation,
      * it's still necessary to use XXH3_NNbits_reset*() afterwards.
      */
    -#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
    +#define XXH3_INITSTATE(XXH3_state_ptr)                       \
    +    do {                                                     \
    +        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
    +        tmp_xxh3_state_ptr->seed = 0;                        \
    +        tmp_xxh3_state_ptr->extSecret = NULL;                \
    +    } while(0)
     
     
    -/* XXH128() :
    - * simple alias to pre-selected XXH3_128bits variant
    +/*!
    + * @brief Calculates the 128-bit hash of @p data using XXH3.
    + *
    + * @param data The block of data to be hashed, at least @p len bytes in size.
    + * @param len  The length of @p data, in bytes.
    + * @param seed The 64-bit seed to alter the hash's output predictably.
    + *
    + * @pre
    + *   The memory between @p data and @p data + @p len must be valid,
    + *   readable, contiguous memory. However, if @p len is `0`, @p data may be
    + *   `NULL`. In C++, this also must be *TriviallyCopyable*.
    + *
    + * @return The calculated 128-bit XXH3 value.
    + *
    + * @see @ref single_shot_example "Single Shot Example" for an example.
      */
    -XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
    +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
     
     
     /* ===   Experimental API   === */
     /* Symbols defined below must be considered tied to a specific library version. */
     
    -/*
    - * XXH3_generateSecret():
    +/*!
    + * @brief Derive a high-entropy secret from any user-defined content, named customSeed.
    + *
    + * @param secretBuffer    A writable buffer for derived high-entropy secret data.
    + * @param secretSize      Size of secretBuffer, in bytes.  Must be >= XXH3_SECRET_SIZE_MIN.
    + * @param customSeed      A user-defined content.
    + * @param customSeedSize  Size of customSeed, in bytes.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
      *
    - * Derive a high-entropy secret from any user-defined content, named customSeed.
      * The generated secret can be used in combination with `*_withSecret()` functions.
    - * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
    - * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
    + * The `_withSecret()` variants are useful to provide a higher level of protection
    + * than 64-bit seed, as it becomes much more difficult for an external actor to
    + * guess how to impact the calculation logic.
      *
      * The function accepts as input a custom seed of any length and any content,
    - * and derives from it a high-entropy secret of length @secretSize
    - * into an already allocated buffer @secretBuffer.
    - * @secretSize must be >= XXH3_SECRET_SIZE_MIN
    + * and derives from it a high-entropy secret of length @p secretSize into an
    + * already allocated buffer @p secretBuffer.
      *
      * The generated secret can then be used with any `*_withSecret()` variant.
    - * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
    - * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
    + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
    + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
      * are part of this list. They all accept a `secret` parameter
    - * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
    + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
      * _and_ feature very high entropy (consist of random-looking bytes).
    - * These conditions can be a high bar to meet, so
    - * XXH3_generateSecret() can be employed to ensure proper quality.
    + * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
    + * be employed to ensure proper quality.
      *
    - * customSeed can be anything. It can have any size, even small ones,
    - * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
    - * The resulting `secret` will nonetheless provide all required qualities.
    + * @p customSeed can be anything. It can have any size, even small ones,
    + * and its content can be anything, even "poor entropy" sources such as a bunch
    + * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
      *
    - * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
    + * @pre
    + *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
    + *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
    + *
    + * Example code:
    + * @code{.c}
    + *    #include <stdio.h>
    + *    #include <string.h>
    + *    #include <stdlib.h>
    + *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
    + *    #include "xxhash.h"
    + *    // Hashes argv[2] using the entropy from argv[1].
    + *    int main(int argc, char* argv[])
    + *    {
    + *        char secret[XXH3_SECRET_SIZE_MIN];
    + *        if (argc != 3) { return 1; }
    + *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
    + *        XXH64_hash_t h = XXH3_64bits_withSecret(
    + *             argv[2], strlen(argv[2]),
    + *             secret, sizeof(secret)
    + *        );
    + *        printf("%016llx\n", (unsigned long long) h);
    + *    }
    + * @endcode
      */
    -XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
    -
    +XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
     
    -/*
    - * XXH3_generateSecret_fromSeed():
    - *
    - * Generate the same secret as the _withSeed() variants.
    +/*!
    + * @brief Generate the same secret as the _withSeed() variants.
      *
    - * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
    - * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
    + * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
    + * @param seed         The 64-bit seed to alter the hash result predictably.
      *
      * The generated secret can be used in combination with
      *`*_withSecret()` and `_withSecretandSeed()` variants.
    - * This generator is notably useful in combination with `_withSecretandSeed()`,
    - * as a way to emulate a faster `_withSeed()` variant.
    + *
    + * Example C++ `std::string` hash class:
    + * @code{.cpp}
    + *    #include <string>
    + *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
    + *    #include "xxhash.h"
    + *    // Slow, seeds each time
    + *    class HashSlow {
    + *        XXH64_hash_t seed;
    + *    public:
    + *        HashSlow(XXH64_hash_t s) : seed{s} {}
    + *        size_t operator()(const std::string& x) const {
    + *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
    + *        }
    + *    };
    + *    // Fast, caches the seeded secret for future uses.
    + *    class HashFast {
    + *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
    + *    public:
    + *        HashFast(XXH64_hash_t s) {
    + *            XXH3_generateSecret_fromSeed(secret, s);
    + *        }
    + *        size_t operator()(const std::string& x) const {
    + *            return size_t{
    + *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
    + *            };
    + *        }
    + *    };
    + * @endcode
      */
    -XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
    +XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
     
    -/*
    - * *_withSecretandSeed() :
    +/*!
    + * @brief Calculates 64-bit seeded variant of XXH3 hash of @p data.
    + *
    + * @param data       The block of data to be hashed, at least @p len bytes in size.
    + * @param len        The length of @p data, in bytes.
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + * @param seed       The 64-bit seed to alter the hash result predictably.
    + *
      * These variants generate hash values using either
    - * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
    - * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
    + * @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
    + * or @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
      *
      * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
      * `_withSeed()` has to generate the secret on the fly for "large" keys.
    @@ -1175,7 +1890,7 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_
      * which requires more instructions than _withSeed() variants.
      * Therefore, _withSecretandSeed variant combines the best of both worlds.
      *
    - * When @secret has been generated by XXH3_generateSecret_fromSeed(),
    + * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
      * this variant produces *exactly* the same results as `_withSeed()` variant,
      * hence offering only a pure speed benefit on "large" input,
      * by skipping the need to regenerate the secret for every large input.
    @@ -1184,33 +1899,71 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_
      * for example with XXH3_64bits(), which then becomes the seed,
      * and then employ both the seed and the secret in _withSecretandSeed().
      * On top of speed, an added benefit is that each bit in the secret
    - * has a 50% chance to swap each bit in the output,
    - * via its impact to the seed.
    + * has a 50% chance to swap each bit in the output, via its impact to the seed.
    + *
      * This is not guaranteed when using the secret directly in "small data" scenarios,
      * because only portions of the secret are employed for small data.
      */
    -XXH_PUBLIC_API XXH64_hash_t
    -XXH3_64bits_withSecretandSeed(const void* data, size_t len,
    -                              const void* secret, size_t secretSize,
    +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
    +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
    +                              XXH_NOESCAPE const void* secret, size_t secretSize,
                                   XXH64_hash_t seed);
    -
    -XXH_PUBLIC_API XXH128_hash_t
    -XXH3_128bits_withSecretandSeed(const void* data, size_t len,
    -                               const void* secret, size_t secretSize,
    +/*!
    + * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
    + *
    + * @param input      The block of data to be hashed, at least @p length bytes in size.
    + * @param length     The length of @p input, in bytes.
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + * @param seed64     The 64-bit seed to alter the hash result predictably.
    + *
    + * @return The calculated 128-bit variant of XXH3 value.
    + *
    + *
    + * @see XXH3_64bits_withSecretandSeed()
    + */
    +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
    +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
    +                               XXH_NOESCAPE const void* secret, size_t secretSize,
                                    XXH64_hash_t seed64);
    -
    +#ifndef XXH_NO_STREAM
    +/*!
    + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
    + *
    + * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + * @param seed64     The 64-bit seed to alter the hash result predictably.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @see XXH3_64bits_withSecretandSeed()
    + */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
    -                                    const void* secret, size_t secretSize,
    +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
    +                                    XXH_NOESCAPE const void* secret, size_t secretSize,
                                         XXH64_hash_t seed64);
    -
    +/*!
    + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
    + *
    + * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
    + * @param secret     The secret data.
    + * @param secretSize The length of @p secret, in bytes.
    + * @param seed64     The 64-bit seed to alter the hash result predictably.
    + *
    + * @return @ref XXH_OK on success.
    + * @return @ref XXH_ERROR on failure.
    + *
    + * @see XXH3_64bits_withSecretandSeed()
    + */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
    -                                     const void* secret, size_t secretSize,
    +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
    +                                     XXH_NOESCAPE const void* secret, size_t secretSize,
                                          XXH64_hash_t seed64);
    +#endif /* !XXH_NO_STREAM */
     
    -
    -#endif  /* XXH_NO_XXH3 */
    +#endif  /* !XXH_NO_XXH3 */
     #endif  /* XXH_NO_LONG_LONG */
     #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
     #  define XXH_IMPLEMENTATION
    @@ -1264,7 +2017,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
     /*!
      * @brief Define this to disable 64-bit code.
      *
    - * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
    + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
      */
     #  define XXH_NO_LONG_LONG
     #  undef XXH_NO_LONG_LONG /* don't actually */
    @@ -1287,7 +2040,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
      *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
      *     eliminate the function call and treat it as an unaligned access.
      *
    - *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
    + *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
      *   @par
      *     Depends on compiler extensions and is therefore not portable.
      *     This method is safe _if_ your compiler supports it,
    @@ -1307,7 +2060,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
      *     inline small `memcpy()` calls, and it might also be faster on big-endian
      *     systems which lack a native byteswap instruction. However, some compilers
      *     will emit literal byteshifts even if the target supports unaligned access.
    - *  .
    + *
      *
      * @warning
      *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
    @@ -1320,6 +2073,34 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
      */
     #  define XXH_FORCE_MEMORY_ACCESS 0
     
    +/*!
    + * @def XXH_SIZE_OPT
    + * @brief Controls how much xxHash optimizes for size.
    + *
    + * xxHash, when compiled, tends to result in a rather large binary size. This
    + * is mostly due to heavy usage to forced inlining and constant folding of the
    + * @ref XXH3_family to increase performance.
    + *
    + * However, some developers prefer size over speed. This option can
    + * significantly reduce the size of the generated code. When using the `-Os`
    + * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
    + * otherwise it is defined to 0.
    + *
    + * Most of these size optimizations can be controlled manually.
    + *
    + * This is a number from 0-2.
    + *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
    + *    comes first.
    + *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
    + *    conservative and disables hacks that increase code size. It implies the
    + *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
    + *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
    + *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
    + *    Performance may cry. For example, the single shot functions just use the
    + *    streaming API.
    + */
    +#  define XXH_SIZE_OPT 0
    +
     /*!
      * @def XXH_FORCE_ALIGN_CHECK
      * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
    @@ -1341,9 +2122,11 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
      *
      * In these cases, the alignment check can be removed by setting this macro to 0.
      * Then the code will always use unaligned memory access.
    - * Align check is automatically disabled on x86, x64 & arm64,
    + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
      * which are platforms known to offer good unaligned memory accesses performance.
      *
    + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
    + *
      * This option does not affect XXH3 (only XXH32 and XXH64).
      */
     #  define XXH_FORCE_ALIGN_CHECK 0
    @@ -1365,11 +2148,28 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
      * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
      * compiler full control on whether to inline or not.
      *
    - * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
    - * -fno-inline with GCC or Clang, this will automatically be defined.
    + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
    + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
      */
     #  define XXH_NO_INLINE_HINTS 0
     
    +/*!
    + * @def XXH3_INLINE_SECRET
    + * @brief Determines whether to inline the XXH3 withSecret code.
    + *
    + * When the secret size is known, the compiler can improve the performance
    + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
    + *
    + * However, if the secret size is not known, it doesn't have any benefit. This
    + * happens when xxHash is compiled into a global symbol. Therefore, if
    + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
    + *
    + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
    + * that are *sometimes* force inline on -Og, and it is impossible to automatically
    + * detect this optimization level.
    + */
    +#  define XXH3_INLINE_SECRET 0
    +
     /*!
      * @def XXH32_ENDJMP
      * @brief Whether to use a jump for `XXH32_finalize`.
    @@ -1391,34 +2191,45 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
      */
     #  define XXH_OLD_NAMES
     #  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
    +
    +/*!
    + * @def XXH_NO_STREAM
    + * @brief Disables the streaming API.
    + *
    + * When xxHash is not inlined and the streaming functions are not used, disabling
    + * the streaming functions can improve code size significantly, especially with
    + * the @ref XXH3_family which tends to make constant folded copies of itself.
    + */
    +#  define XXH_NO_STREAM
    +#  undef XXH_NO_STREAM /* don't actually */
     #endif /* XXH_DOXYGEN */
     /*!
      * @}
      */
     
     #ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
    -   /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
    -#  if !defined(__clang__) && \
    -( \
    -    (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
    -    ( \
    -        defined(__GNUC__) && ( \
    -            (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
    -            ( \
    -                defined(__mips__) && \
    -                (__mips <= 5 || __mips_isa_rev < 6) && \
    -                (!defined(__mips16) || defined(__mips_mips16e2)) \
    -            ) \
    -        ) \
    -    ) \
    -)
    +   /* prefer __packed__ structures (method 1) for GCC
    +    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
    +    * which for some reason does unaligned loads. */
    +#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
     #    define XXH_FORCE_MEMORY_ACCESS 1
     #  endif
     #endif
     
    +#ifndef XXH_SIZE_OPT
    +   /* default to 1 for -Os or -Oz */
    +#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
    +#    define XXH_SIZE_OPT 1
    +#  else
    +#    define XXH_SIZE_OPT 0
    +#  endif
    +#endif
    +
     #ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
    -#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
    -   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
    +   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
    +#  if XXH_SIZE_OPT >= 1 || \
    +      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
    +   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* visual */
     #    define XXH_FORCE_ALIGN_CHECK 0
     #  else
     #    define XXH_FORCE_ALIGN_CHECK 1
    @@ -1426,14 +2237,22 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
     #endif
     
     #ifndef XXH_NO_INLINE_HINTS
    -#  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
    -   || defined(__NO_INLINE__)     /* -O0, -fno-inline */
    +#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
     #    define XXH_NO_INLINE_HINTS 1
     #  else
     #    define XXH_NO_INLINE_HINTS 0
     #  endif
     #endif
     
    +#ifndef XXH3_INLINE_SECRET
    +#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
    +     || !defined(XXH_INLINE_ALL)
    +#    define XXH3_INLINE_SECRET 0
    +#  else
    +#    define XXH3_INLINE_SECRET 1
    +#  endif
    +#endif
    +
     #ifndef XXH32_ENDJMP
     /* generally preferable for performance */
     #  define XXH32_ENDJMP 0
    @@ -1448,13 +2267,56 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
     /* *************************************
     *  Includes & Memory related functions
     ***************************************/
    -/* Modify the local functions below should you wish to use some other memory routines */
    -/* for ZSTD_malloc(), ZSTD_free() */
    -#define ZSTD_DEPS_NEED_MALLOC
    -#include "zstd_deps.h"  /* size_t, ZSTD_malloc, ZSTD_free, ZSTD_memcpy */
    -static void* XXH_malloc(size_t s) { return ZSTD_malloc(s); }
    -static void  XXH_free  (void* p)  { ZSTD_free(p); }
    -static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_memcpy(dest,src,size); }
    +#if defined(XXH_NO_STREAM)
    +/* nothing */
    +#elif defined(XXH_NO_STDLIB)
    +
    +/* When requesting to disable any mention of stdlib,
    + * the library loses the ability to invoked malloc / free.
    + * In practice, it means that functions like `XXH*_createState()`
    + * will always fail, and return NULL.
    + * This flag is useful in situations where
    + * xxhash.h is integrated into some kernel, embedded or limited environment
    + * without access to dynamic allocation.
    + */
    +
    +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
    +static void XXH_free(void* p) { (void)p; }
    +
    +#else
    +
    +/*
    + * Modify the local functions below should you wish to use
    + * different memory routines for malloc() and free()
    + */
    +#include 
    +
    +/*!
    + * @internal
    + * @brief Modify this function to use a different routine than malloc().
    + */
    +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
    +
    +/*!
    + * @internal
    + * @brief Modify this function to use a different routine than free().
    + */
    +static void XXH_free(void* p) { free(p); }
    +
    +#endif  /* XXH_NO_STDLIB */
    +
    +#include 
    +
    +/*!
    + * @internal
    + * @brief Modify this function to use a different routine than memcpy().
    + */
    +static void* XXH_memcpy(void* dest, const void* src, size_t size)
    +{
    +    return memcpy(dest,src,size);
    +}
    +
    +#include    /* ULLONG_MAX */
     
     
     /* *************************************
    @@ -1487,6 +2349,11 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_
     #  define XXH_NO_INLINE static
     #endif
     
    +#if XXH3_INLINE_SECRET
    +#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
    +#else
    +#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
    +#endif
     
     
     /* *************************************
    @@ -1512,14 +2379,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_
     #  include    /* note: can still be disabled with NDEBUG */
     #  define XXH_ASSERT(c)   assert(c)
     #else
    -#  define XXH_ASSERT(c)   ((void)0)
    +#  if defined(__INTEL_COMPILER)
    +#    define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
    +#  else
    +#    define XXH_ASSERT(c)   XXH_ASSUME(c)
    +#  endif
     #endif
     
     /* note: use after variable declarations */
     #ifndef XXH_STATIC_ASSERT
     #  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
    -#    include 
    -#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
    +#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
     #  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
     #    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
     #  else
    @@ -1534,7 +2404,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_
      * @brief Used to prevent unwanted optimizations for @p var.
      *
      * It uses an empty GCC inline assembly statement with a register constraint
    - * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
    + * which forces @p var into a general purpose register (eg eax, ebx, ecx
      * on x86) and marks it as modified.
      *
      * This is used in a few places to avoid unwanted autovectorization (e.g.
    @@ -1545,18 +2415,30 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_
      * XXH3_initCustomSecret_scalar().
      */
     #if defined(__GNUC__) || defined(__clang__)
    -#  define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
    +#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
     #else
     #  define XXH_COMPILER_GUARD(var) ((void)0)
     #endif
     
    +/* Specifically for NEON vectors which use the "w" constraint, on
    + * Clang. */
    +#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
    +#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
    +#else
    +#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
    +#endif
    +
     /* *************************************
     *  Basic Types
     ***************************************/
     #if !defined (__VMS) \
      && (defined (__cplusplus) \
      || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
    -# include 
    +# ifdef _AIX
    +#   include 
    +# else
    +#   include 
    +# endif
       typedef uint8_t xxh_u8;
     #else
       typedef unsigned char xxh_u8;
    @@ -1564,6 +2446,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return ZSTD_
     typedef XXH32_hash_t xxh_u32;
     
     #ifdef XXH_OLD_NAMES
    +#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
     #  define BYTE xxh_u8
     #  define U8   xxh_u8
     #  define U32  xxh_u32
    @@ -1637,18 +2520,19 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr;
     #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
     
     /*
    - * __pack instructions are safer but compiler specific, hence potentially
    - * problematic for some compilers.
    - *
    - * Currently only defined for GCC and ICC.
    + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
    + * documentation claimed that it only increased the alignment, but actually it
    + * can decrease it on gcc, clang, and icc:
    + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
    + * https://gcc.godbolt.org/z/xYez1j67Y.
      */
     #ifdef XXH_OLD_NAMES
     typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
     #endif
     static xxh_u32 XXH_read32(const void* ptr)
     {
    -    typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
    -    return ((const xxh_unalign*)ptr)->u32;
    +    typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
    +    return *((const xxh_unalign32*)ptr);
     }
     
     #else
    @@ -1731,6 +2615,51 @@ static int XXH_isLittleEndian(void)
     #  define XXH_HAS_BUILTIN(x) 0
     #endif
     
    +
    +
    +/*
    + * C23 and future versions have standard "unreachable()".
    + * Once it has been implemented reliably we can add it as an
    + * additional case:
    + *
    + * ```
    + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
    + * #  include 
    + * #  ifdef unreachable
    + * #    define XXH_UNREACHABLE() unreachable()
    + * #  endif
    + * #endif
    + * ```
    + *
    + * Note C++23 also has std::unreachable() which can be detected
    + * as follows:
    + * ```
    + * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
    + * #  include 
    + * #  define XXH_UNREACHABLE() std::unreachable()
    + * #endif
    + * ```
    + * NB: `__cpp_lib_unreachable` is defined in the `` header.
    + * We don't use that as including `` in `extern "C"` blocks
    + * doesn't work on GCC12
    + */
    +
    +#if XXH_HAS_BUILTIN(__builtin_unreachable)
    +#  define XXH_UNREACHABLE() __builtin_unreachable()
    +
    +#elif defined(_MSC_VER)
    +#  define XXH_UNREACHABLE() __assume(0)
    +
    +#else
    +#  define XXH_UNREACHABLE()
    +#endif
    +
    +#if XXH_HAS_BUILTIN(__builtin_assume)
    +#  define XXH_ASSUME(c) __builtin_assume(c)
    +#else
    +#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
    +#endif
    +
     /*!
      * @internal
      * @def XXH_rotl32(x,r)
    @@ -1853,8 +2782,10 @@ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
     *********************************************************************/
     /*!
      * @}
    - * @defgroup xxh32_impl XXH32 implementation
    + * @defgroup XXH32_impl XXH32 implementation
      * @ingroup impl
    + *
    + * Details on the XXH32 implementation.
      * @{
      */
      /* #define instead of static const, to be used as initializers */
    @@ -1888,7 +2819,7 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
         acc += input * XXH_PRIME32_2;
         acc  = XXH_rotl32(acc, 13);
         acc *= XXH_PRIME32_1;
    -#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
    +#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
         /*
          * UGLY HACK:
          * A compiler fence is the only thing that prevents GCC and Clang from
    @@ -1918,9 +2849,12 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
          *   can load data, while v3 can multiply. SSE forces them to operate
          *   together.
          *
    -     * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
    -     * and it is pointless writing a NEON implementation that is basically the
    -     * same speed as scalar for XXH32.
    +     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
    +     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
    +     * than half the speed.
    +     *
    +     * Additionally, this is used on WASM SIMD128 because it JITs to the same
    +     * SIMD instructions and has the same issue.
          */
         XXH_COMPILER_GUARD(acc);
     #endif
    @@ -1934,17 +2868,17 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
      * The final mix ensures that all input bits have a chance to impact any bit in
      * the output digest, resulting in an unbiased distribution.
      *
    - * @param h32 The hash to avalanche.
    + * @param hash The hash to avalanche.
      * @return The avalanched hash.
      */
    -static xxh_u32 XXH32_avalanche(xxh_u32 h32)
    +static xxh_u32 XXH32_avalanche(xxh_u32 hash)
     {
    -    h32 ^= h32 >> 15;
    -    h32 *= XXH_PRIME32_2;
    -    h32 ^= h32 >> 13;
    -    h32 *= XXH_PRIME32_3;
    -    h32 ^= h32 >> 16;
    -    return(h32);
    +    hash ^= hash >> 15;
    +    hash *= XXH_PRIME32_2;
    +    hash ^= hash >> 13;
    +    hash *= XXH_PRIME32_3;
    +    hash ^= hash >> 16;
    +    return hash;
     }
     
     #define XXH_get32bits(p) XXH_readLE32_align(p, align)
    @@ -1957,24 +2891,25 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32)
      * This final stage will digest them to ensure that all input bytes are present
      * in the final mix.
      *
    - * @param h32 The hash to finalize.
    + * @param hash The hash to finalize.
      * @param ptr The pointer to the remaining input.
      * @param len The remaining length, modulo 16.
      * @param align Whether @p ptr is aligned.
      * @return The finalized hash.
    + * @see XXH64_finalize().
      */
    -static xxh_u32
    -XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
    +static XXH_PUREF xxh_u32
    +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
     {
    -#define XXH_PROCESS1 do {                           \
    -    h32 += (*ptr++) * XXH_PRIME32_5;                \
    -    h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1;      \
    +#define XXH_PROCESS1 do {                             \
    +    hash += (*ptr++) * XXH_PRIME32_5;                 \
    +    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
     } while (0)
     
    -#define XXH_PROCESS4 do {                           \
    -    h32 += XXH_get32bits(ptr) * XXH_PRIME32_3;      \
    -    ptr += 4;                                   \
    -    h32  = XXH_rotl32(h32, 17) * XXH_PRIME32_4;     \
    +#define XXH_PROCESS4 do {                             \
    +    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
    +    ptr += 4;                                         \
    +    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
     } while (0)
     
         if (ptr==NULL) XXH_ASSERT(len == 0);
    @@ -1990,49 +2925,49 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
                 XXH_PROCESS1;
                 --len;
             }
    -        return XXH32_avalanche(h32);
    +        return XXH32_avalanche(hash);
         } else {
              switch(len&15) /* or switch(bEnd - p) */ {
                case 12:      XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 8:       XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 4:       XXH_PROCESS4;
    -                         return XXH32_avalanche(h32);
    +                         return XXH32_avalanche(hash);
     
                case 13:      XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 9:       XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 5:       XXH_PROCESS4;
                              XXH_PROCESS1;
    -                         return XXH32_avalanche(h32);
    +                         return XXH32_avalanche(hash);
     
                case 14:      XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 10:      XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 6:       XXH_PROCESS4;
                              XXH_PROCESS1;
                              XXH_PROCESS1;
    -                         return XXH32_avalanche(h32);
    +                         return XXH32_avalanche(hash);
     
                case 15:      XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 11:      XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 7:       XXH_PROCESS4;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 3:       XXH_PROCESS1;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 2:       XXH_PROCESS1;
    -                         XXH_FALLTHROUGH;
    +                         XXH_FALLTHROUGH;  /* fallthrough */
                case 1:       XXH_PROCESS1;
    -                         XXH_FALLTHROUGH;
    -           case 0:       return XXH32_avalanche(h32);
    +                         XXH_FALLTHROUGH;  /* fallthrough */
    +           case 0:       return XXH32_avalanche(hash);
             }
             XXH_ASSERT(0);
    -        return h32;   /* reaching this point is deemed impossible */
    +        return hash;   /* reaching this point is deemed impossible */
         }
     }
     
    @@ -2052,7 +2987,7 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
      * @param align Whether @p input is aligned.
      * @return The calculated hash.
      */
    -XXH_FORCE_INLINE xxh_u32
    +XXH_FORCE_INLINE XXH_PUREF xxh_u32
     XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
     {
         xxh_u32 h32;
    @@ -2085,10 +3020,10 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment
         return XXH32_finalize(h32, input, len&15, align);
     }
     
    -/*! @ingroup xxh32_family */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
     {
    -#if 0
    +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
         /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
         XXH32_state_t state;
         XXH32_reset(&state, seed);
    @@ -2107,27 +3042,26 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t s
     
     
     /*******   Hash streaming   *******/
    -/*!
    - * @ingroup xxh32_family
    - */
    +#ifndef XXH_NO_STREAM
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
     {
         return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
     }
    -/*! @ingroup xxh32_family */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
     {
         XXH_free(statePtr);
         return XXH_OK;
     }
     
    -/*! @ingroup xxh32_family */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
     {
         XXH_memcpy(dstState, srcState, sizeof(*dstState));
     }
     
    -/*! @ingroup xxh32_family */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
     {
         XXH_ASSERT(statePtr != NULL);
    @@ -2140,7 +3074,7 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t s
     }
     
     
    -/*! @ingroup xxh32_family */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API XXH_errorcode
     XXH32_update(XXH32_state_t* state, const void* input, size_t len)
     {
    @@ -2195,7 +3129,7 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
     }
     
     
    -/*! @ingroup xxh32_family */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
     {
         xxh_u32 h32;
    @@ -2213,31 +3147,18 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
     
         return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
     }
    -
    +#endif /* !XXH_NO_STREAM */
     
     /*******   Canonical representation   *******/
     
    -/*!
    - * @ingroup xxh32_family
    - * The default return values from XXH functions are unsigned 32 and 64 bit
    - * integers.
    - *
    - * The canonical representation uses big endian convention, the same convention
    - * as human-readable numbers (large digits first).
    - *
    - * This way, hash values can be written into a file or buffer, remaining
    - * comparable across different systems.
    - *
    - * The following functions allow transformation of hash values to and from their
    - * canonical format.
    - */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
     {
    -    /* XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); */
    +    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
         if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
         XXH_memcpy(dst, &hash, sizeof(*dst));
     }
    -/*! @ingroup xxh32_family */
    +/*! @ingroup XXH32_family */
     XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
     {
         return XXH_readBE32(src);
    @@ -2278,18 +3199,19 @@ static xxh_u64 XXH_read64(const void* memPtr)
     #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
     
     /*
    - * __pack instructions are safer, but compiler specific, hence potentially
    - * problematic for some compilers.
    - *
    - * Currently only defined for GCC and ICC.
    + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
    + * documentation claimed that it only increased the alignment, but actually it
    + * can decrease it on gcc, clang, and icc:
    + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
    + * https://gcc.godbolt.org/z/xYez1j67Y.
      */
     #ifdef XXH_OLD_NAMES
     typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
     #endif
     static xxh_u64 XXH_read64(const void* ptr)
     {
    -    typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
    -    return ((const xxh_unalign64*)ptr)->u64;
    +    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
    +    return *((const xxh_unalign64*)ptr);
     }
     
     #else
    @@ -2380,8 +3302,10 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align)
     /*******   xxh64   *******/
     /*!
      * @}
    - * @defgroup xxh64_impl XXH64 implementation
    + * @defgroup XXH64_impl XXH64 implementation
      * @ingroup impl
    + *
    + * Details on the XXH64 implementation.
      * @{
      */
     /* #define rather that static const, to be used as initializers */
    @@ -2399,11 +3323,29 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align)
     #  define PRIME64_5 XXH_PRIME64_5
     #endif
     
    +/*! @copydoc XXH32_round */
     static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
     {
         acc += input * XXH_PRIME64_2;
         acc  = XXH_rotl64(acc, 31);
         acc *= XXH_PRIME64_1;
    +#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
    +    /*
    +     * DISABLE AUTOVECTORIZATION:
    +     * A compiler fence is used to prevent GCC and Clang from
    +     * autovectorizing the XXH64 loop (pragmas and attributes don't work for some
    +     * reason) without globally disabling AVX512.
    +     *
    +     * Autovectorization of XXH64 tends to be detrimental,
    +     * though the exact outcome may change depending on exact cpu and compiler version.
    +     * For information, it has been reported as detrimental for Skylake-X,
    +     * but possibly beneficial for Zen4.
    +     *
    +     * The default is to disable auto-vectorization,
    +     * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable.
    +     */
    +    XXH_COMPILER_GUARD(acc);
    +#endif
         return acc;
     }
     
    @@ -2415,43 +3357,59 @@ static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
         return acc;
     }
     
    -static xxh_u64 XXH64_avalanche(xxh_u64 h64)
    +/*! @copydoc XXH32_avalanche */
    +static xxh_u64 XXH64_avalanche(xxh_u64 hash)
     {
    -    h64 ^= h64 >> 33;
    -    h64 *= XXH_PRIME64_2;
    -    h64 ^= h64 >> 29;
    -    h64 *= XXH_PRIME64_3;
    -    h64 ^= h64 >> 32;
    -    return h64;
    +    hash ^= hash >> 33;
    +    hash *= XXH_PRIME64_2;
    +    hash ^= hash >> 29;
    +    hash *= XXH_PRIME64_3;
    +    hash ^= hash >> 32;
    +    return hash;
     }
     
     
     #define XXH_get64bits(p) XXH_readLE64_align(p, align)
     
    -static xxh_u64
    -XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
    +/*!
    + * @internal
    + * @brief Processes the last 0-31 bytes of @p ptr.
    + *
    + * There may be up to 31 bytes remaining to consume from the input.
    + * This final stage will digest them to ensure that all input bytes are present
    + * in the final mix.
    + *
    + * @param hash The hash to finalize.
    + * @param ptr The pointer to the remaining input.
    + * @param len The remaining length, modulo 32.
    + * @param align Whether @p ptr is aligned.
    + * @return The finalized hash
    + * @see XXH32_finalize().
    + */
    +static XXH_PUREF xxh_u64
    +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
     {
         if (ptr==NULL) XXH_ASSERT(len == 0);
         len &= 31;
         while (len >= 8) {
             xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
             ptr += 8;
    -        h64 ^= k1;
    -        h64  = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
    +        hash ^= k1;
    +        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
             len -= 8;
         }
         if (len >= 4) {
    -        h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
    +        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
             ptr += 4;
    -        h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
    +        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
             len -= 4;
         }
         while (len > 0) {
    -        h64 ^= (*ptr++) * XXH_PRIME64_5;
    -        h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
    +        hash ^= (*ptr++) * XXH_PRIME64_5;
    +        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
             --len;
         }
    -    return  XXH64_avalanche(h64);
    +    return  XXH64_avalanche(hash);
     }
     
     #ifdef XXH_OLD_NAMES
    @@ -2464,7 +3422,15 @@ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
     #  undef XXH_PROCESS8_64
     #endif
     
    -XXH_FORCE_INLINE xxh_u64
    +/*!
    + * @internal
    + * @brief The implementation for @ref XXH64().
    + *
    + * @param input , len , seed Directly passed from @ref XXH64().
    + * @param align Whether @p input is aligned.
    + * @return The calculated hash.
    + */
    +XXH_FORCE_INLINE XXH_PUREF xxh_u64
     XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
     {
         xxh_u64 h64;
    @@ -2501,10 +3467,10 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
     }
     
     
    -/*! @ingroup xxh64_family */
    -XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
    +/*! @ingroup XXH64_family */
    +XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
     {
    -#if 0
    +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
         /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
         XXH64_state_t state;
         XXH64_reset(&state, seed);
    @@ -2522,27 +3488,27 @@ XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t s
     }
     
     /*******   Hash Streaming   *******/
    -
    -/*! @ingroup xxh64_family*/
    +#ifndef XXH_NO_STREAM
    +/*! @ingroup XXH64_family*/
     XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
     {
         return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
     }
    -/*! @ingroup xxh64_family */
    +/*! @ingroup XXH64_family */
     XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
     {
         XXH_free(statePtr);
         return XXH_OK;
     }
     
    -/*! @ingroup xxh64_family */
    -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
    +/*! @ingroup XXH64_family */
    +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
     {
         XXH_memcpy(dstState, srcState, sizeof(*dstState));
     }
     
    -/*! @ingroup xxh64_family */
    -XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
    +/*! @ingroup XXH64_family */
    +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
     {
         XXH_ASSERT(statePtr != NULL);
         memset(statePtr, 0, sizeof(*statePtr));
    @@ -2553,9 +3519,9 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s
         return XXH_OK;
     }
     
    -/*! @ingroup xxh64_family */
    +/*! @ingroup XXH64_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH64_update (XXH64_state_t* state, const void* input, size_t len)
    +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
     {
         if (input==NULL) {
             XXH_ASSERT(len == 0);
    @@ -2605,8 +3571,8 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
     }
     
     
    -/*! @ingroup xxh64_family */
    -XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
    +/*! @ingroup XXH64_family */
    +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
     {
         xxh_u64 h64;
     
    @@ -2624,20 +3590,20 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
     
         return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
     }
    -
    +#endif /* !XXH_NO_STREAM */
     
     /******* Canonical representation   *******/
     
    -/*! @ingroup xxh64_family */
    -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
    +/*! @ingroup XXH64_family */
    +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
     {
    -    /* XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); */
    +    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
         if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
         XXH_memcpy(dst, &hash, sizeof(*dst));
     }
     
    -/*! @ingroup xxh64_family */
    -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
    +/*! @ingroup XXH64_family */
    +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
     {
         return XXH_readBE64(src);
     }
    @@ -2650,7 +3616,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
     ************************************************************************ */
     /*!
      * @}
    - * @defgroup xxh3_impl XXH3 implementation
    + * @defgroup XXH3_impl XXH3 implementation
      * @ingroup impl
      * @{
      */
    @@ -2658,11 +3624,19 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
     /* ===   Compiler specifics   === */
     
     #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
    -#  define XXH_RESTRICT /* disable */
    +#  define XXH_RESTRICT   /* disable */
     #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
     #  define XXH_RESTRICT   restrict
    +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
    +   || (defined (__clang__)) \
    +   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
    +   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
    +/*
    + * There are a LOT more compilers that recognize __restrict but this
    + * covers the major ones.
    + */
    +#  define XXH_RESTRICT   __restrict
     #else
    -/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
     #  define XXH_RESTRICT   /* disable */
     #endif
     
    @@ -2676,10 +3650,26 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
     #    define XXH_unlikely(x) (x)
     #endif
     
    +#ifndef XXH_HAS_INCLUDE
    +#  ifdef __has_include
    +/*
    + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because
    + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)
    + */
    +#    define XXH_HAS_INCLUDE __has_include
    +#  else
    +#    define XXH_HAS_INCLUDE(x) 0
    +#  endif
    +#endif
    +
     #if defined(__GNUC__) || defined(__clang__)
    +#  if defined(__ARM_FEATURE_SVE)
     +#    include <arm_sve.h>
    +#  endif
     #  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
    -   || defined(__aarch64__)  || defined(_M_ARM) \
    -   || defined(_M_ARM64)     || defined(_M_ARM64EC)
    +   || (defined(_M_ARM) && _M_ARM >= 7) \
    +   || defined(_M_ARM64) || defined(_M_ARM64EC) \
     +   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<wasm_simd128.h>)) /* WASM SIMD128 via SIMDe */
     #    define inline __inline__  /* circumvent a clang bug */
      #    include <arm_neon.h>
     #    undef inline
    @@ -2790,7 +3780,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
      * Note that these are actually implemented as macros.
      *
      * If this is not defined, it is detected automatically.
    - * @ref XXH_X86DISPATCH overrides this.
    + * internal macro XXH_X86DISPATCH overrides this.
      */
     enum XXH_VECTOR_TYPE /* fake enum */ {
         XXH_SCALAR = 0,  /*!< Portable scalar version */
    @@ -2802,8 +3792,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
                           */
         XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
         XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
    -    XXH_NEON   = 4,  /*!< NEON for most ARMv7-A and all AArch64 */
    +    XXH_NEON   = 4,  /*!<
    +                       * NEON for most ARMv7-A, all AArch64, and WASM SIMD128
    +                       * via the SIMDeverywhere polyfill provided with the
    +                       * Emscripten SDK.
    +                       */
         XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
    +    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
     };
     /*!
      * @ingroup tuning
    @@ -2825,12 +3820,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
     #  define XXH_AVX512 3
     #  define XXH_NEON   4
     #  define XXH_VSX    5
    +#  define XXH_SVE    6
     #endif
     
     #ifndef XXH_VECTOR    /* can be defined on command line */
    -#  if ( \
    +#  if defined(__ARM_FEATURE_SVE)
    +#    define XXH_VECTOR XXH_SVE
    +#  elif ( \
             defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
          || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
     +     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<wasm_simd128.h>)) /* wasm simd128 via SIMDe */ \
        ) && ( \
             defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
         || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
    @@ -2851,6 +3850,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
     #  endif
     #endif
     
    +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
    +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
    +#  ifdef _MSC_VER
    +#    pragma warning(once : 4606)
    +#  else
    +#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
    +#  endif
    +#  undef XXH_VECTOR
    +#  define XXH_VECTOR XXH_SCALAR
    +#endif
    +
     /*
      * Controls the alignment of the accumulator,
      * for compatibility with aligned vector loads, which are usually faster.
    @@ -2870,16 +3880,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
     #     define XXH_ACC_ALIGN 16
     #  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
     #     define XXH_ACC_ALIGN 64
    +#  elif XXH_VECTOR == XXH_SVE   /* sve */
    +#     define XXH_ACC_ALIGN 64
     #  endif
     #endif
     
     #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
         || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
     #  define XXH_SEC_ALIGN XXH_ACC_ALIGN
    +#elif XXH_VECTOR == XXH_SVE
    +#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
     #else
     #  define XXH_SEC_ALIGN 8
     #endif
     
    +#if defined(__GNUC__) || defined(__clang__)
    +#  define XXH_ALIASING __attribute__((may_alias))
    +#else
    +#  define XXH_ALIASING /* nothing */
    +#endif
    +
     /*
      * UGLY HACK:
      * GCC usually generates the best code with -O3 for xxHash.
    @@ -2892,164 +3912,137 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
      *
      * That is why when compiling the AVX2 version, it is recommended to use either
    - *   -O2 -mavx2 -march=haswell
    - * or
    - *   -O2 -mavx2 -mno-avx256-split-unaligned-load
    - * for decent performance, or to use Clang instead.
    - *
    - * Fortunately, we can control the first one with a pragma that forces GCC into
    - * -O2, but the other one we can't control without "failed to inline always
    - * inline function due to target mismatch" warnings.
    - */
    -#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
    -  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
    -  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
    -#  pragma GCC push_options
    -#  pragma GCC optimize("-O2")
    -#endif
    -
    -
    -#if XXH_VECTOR == XXH_NEON
    -/*
    - * NEON's setup for vmlal_u32 is a little more complicated than it is on
    - * SSE2, AVX2, and VSX.
    - *
    - * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
    - *
    - * To do the same operation, the 128-bit 'Q' register needs to be split into
    - * two 64-bit 'D' registers, performing this operation::
    - *
    - *   [                a                 |                 b                ]
    - *            |              '---------. .--------'                |
    - *            |                         x                          |
    - *            |              .---------' '--------.                |
    - *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
    - *
    - * Due to significant changes in aarch64, the fastest method for aarch64 is
    - * completely different than the fastest method for ARMv7-A.
    - *
    - * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
    - * D11 will modify the high half of Q5. This is similar to how modifying AH
    - * will only affect bits 8-15 of AX on x86.
    - *
    - * VZIP takes two registers, and puts even lanes in one register and odd lanes
    - * in the other.
    - *
    - * On ARMv7-A, this strangely modifies both parameters in place instead of
    - * taking the usual 3-operand form.
    - *
    - * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
    - * lower and upper halves of the Q register to end up with the high and low
    - * halves where we want - all in one instruction.
    - *
    - *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
    - *
    - * Unfortunately we need inline assembly for this: Instructions modifying two
    - * registers at once is not possible in GCC or Clang's IR, and they have to
    - * create a copy.
    - *
    - * aarch64 requires a different approach.
    - *
    - * In order to make it easier to write a decent compiler for aarch64, many
    - * quirks were removed, such as conditional execution.
    - *
    - * NEON was also affected by this.
    - *
    - * aarch64 cannot access the high bits of a Q-form register, and writes to a
    - * D-form register zero the high bits, similar to how writes to W-form scalar
    - * registers (or DWORD registers on x86_64) work.
    - *
    - * The formerly free vget_high intrinsics now require a vext (with a few
    - * exceptions)
    - *
    - * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
    - * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
    - * operand.
    + *   -O2 -mavx2 -march=haswell
    + * or
    + *   -O2 -mavx2 -mno-avx256-split-unaligned-load
    + * for decent performance, or to use Clang instead.
      *
    - * The equivalent of the VZIP.32 on the lower and upper halves would be this
    - * mess:
    + * Fortunately, we can control the first one with a pragma that forces GCC into
    + * -O2, but the other one we can't control without "failed to inline always
    + * inline function due to target mismatch" warnings.
    + */
    +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
    +  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
    +  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
    +#  pragma GCC push_options
    +#  pragma GCC optimize("-O2")
    +#endif
    +
    +#if XXH_VECTOR == XXH_NEON
    +
    +/*
    + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
    + * optimizes out the entire hashLong loop because of the aliasing violation.
      *
    - *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
    - *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
    - *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
    + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
    + * so the only option is to mark it as aliasing.
    + */
    +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
    +
    +/*!
    + * @internal
    + * @brief `vld1q_u64` but faster and alignment-safe.
      *
    - * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
    + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
    + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
      *
    - *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
    - *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
    + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
    + * prohibits load-store optimizations. Therefore, a direct dereference is used.
      *
    - * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
    + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
    + * unaligned load.
      */
    +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
    +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
    +{
    +    return *(xxh_aliasing_uint64x2_t const *)ptr;
    +}
    +#else
    +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
    +{
    +    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
    +}
    +#endif
     
     /*!
    - * Function-like macro:
    - * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
    - * {
    - *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
    - *     outHi = (uint32x2_t)(in >> 32);
    - *     in = UNDEFINED;
    - * }
    + * @internal
    + * @brief `vmlal_u32` on low and high halves of a vector.
    + *
    + * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
    + * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
    + * with `vmlal_u32`.
      */
    -# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
    -   && (defined(__GNUC__) || defined(__clang__)) \
    -   && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
    -#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                              \
    -    do {                                                                                    \
    -      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
    -      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */     \
    -      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
    -      __asm__("vzip.32  %e0, %f0" : "+w" (in));                                             \
    -      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in));                                   \
    -      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                   \
    -   } while (0)
    -# else
    -#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                            \
    -    do {                                                                                  \
    -      (outLo) = vmovn_u64    (in);                                                        \
    -      (outHi) = vshrn_n_u64  ((in), 32);                                                  \
    -    } while (0)
    -# endif
    +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
    +XXH_FORCE_INLINE uint64x2_t
    +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
    +{
    +    /* Inline assembly is the only way */
    +    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
    +    return acc;
    +}
    +XXH_FORCE_INLINE uint64x2_t
    +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
    +{
    +    /* This intrinsic works as expected */
    +    return vmlal_high_u32(acc, lhs, rhs);
    +}
    +#else
    +/* Portable intrinsic versions */
    +XXH_FORCE_INLINE uint64x2_t
    +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
    +{
    +    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
    +}
    +/*! @copydoc XXH_vmlal_low_u32
    + * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
    +XXH_FORCE_INLINE uint64x2_t
    +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
    +{
    +    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
    +}
    +#endif
     
     /*!
      * @ingroup tuning
      * @brief Controls the NEON to scalar ratio for XXH3
      *
    - * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
    - * 2 lanes on scalar by default.
    + * This can be set to 2, 4, 6, or 8.
      *
    - * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
    - * emulated 64-bit arithmetic is too slow.
    + * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
      *
    - * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
    + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
    + * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
    + * bandwidth.
      *
    - * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
    - * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
    - * you are only using 2/3 of the CPU bandwidth.
    - *
    - * This is even more noticeable on the more advanced cores like the A76 which
    + * This is even more noticeable on the more advanced cores like the Cortex-A76 which
      * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
      *
    - * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
    - * remaining lanes will use scalar instructions. This improves the bandwidth
    - * and also gives the integer pipelines something to do besides twiddling loop
    - * counters and pointers.
    + * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
    + * and 2 scalar lanes, which is chosen by default.
    + *
    + * This does not apply to Apple processors or 32-bit processors, which run better with
    + * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
      *
      * This change benefits CPUs with large micro-op buffers without negatively affecting
    - * other CPUs:
    + * most other CPUs:
      *
      *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
      *  |:----------------------|:--------------------|----------:|-----------:|------:|
      *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
      *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
      *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
    + *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
      *
      * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
      *
    + * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning
    + * it effectively becomes worse 4.
    + *
      * @see XXH3_accumulate_512_neon()
      */
     # ifndef XXH3_NEON_LANES
     #  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
    -   && !defined(__OPTIMIZE_SIZE__)
    +   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
     #   define XXH3_NEON_LANES 6
     #  else
     #   define XXH3_NEON_LANES XXH_ACC_NB
    @@ -3066,27 +4059,42 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
      */
     #if XXH_VECTOR == XXH_VSX
    +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
    + * and `pixel`. This is a problem for obvious reasons.
    + *
    + * These keywords are unnecessary; the spec literally says they are
    + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
    + * after including the header.
    + *
    + * We use pragma push_macro/pop_macro to keep the namespace clean. */
    +#  pragma push_macro("bool")
    +#  pragma push_macro("vector")
    +#  pragma push_macro("pixel")
    +/* silence potential macro redefined warnings */
    +#  undef bool
    +#  undef vector
    +#  undef pixel
    +
     #  if defined(__s390x__)
      #    include <s390intrin.h>
     #  else
    -/* gcc's altivec.h can have the unwanted consequence to unconditionally
    - * #define bool, vector, and pixel keywords,
    - * with bad consequences for programs already using these keywords for other purposes.
    - * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
    - * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
    - * but it seems that, in some cases, it isn't.
    - * Force the build macro to be defined, so that keywords are not altered.
    - */
    -#    if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
    -#      define __APPLE_ALTIVEC__
    -#    endif
      #    include <altivec.h>
     #  endif
     
    +/* Restore the original macro values, if applicable. */
    +#  pragma pop_macro("pixel")
    +#  pragma pop_macro("vector")
    +#  pragma pop_macro("bool")
    +
     typedef __vector unsigned long long xxh_u64x2;
     typedef __vector unsigned char xxh_u8x16;
     typedef __vector unsigned xxh_u32x4;
     
    +/*
    + * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
    + */
    +typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
    +
     # ifndef XXH_VSX_BE
     #  if defined(__BIG_ENDIAN__) \
       || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    @@ -3138,8 +4146,9 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
      /* s390x is always big endian, no issue on this platform */
     #  define XXH_vec_mulo vec_mulo
     #  define XXH_vec_mule vec_mule
    -# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
    +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
     /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
    + /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */
     #  define XXH_vec_mulo __builtin_altivec_vmulouw
     #  define XXH_vec_mule __builtin_altivec_vmuleuw
     # else
    @@ -3160,13 +4169,28 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
     # endif /* XXH_vec_mulo, XXH_vec_mule */
     #endif /* XXH_VECTOR == XXH_VSX */
     
    +#if XXH_VECTOR == XXH_SVE
    +#define ACCRND(acc, offset) \
    +do { \
    +    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
    +    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
    +    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
    +    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
    +    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
    +    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
    +    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
    +    acc = svadd_u64_x(mask, acc, mul);                               \
    +} while (0)
    +#endif /* XXH_VECTOR == XXH_SVE */
     
     /* prefetch
      * can be disabled, by declaring XXH_NO_PREFETCH build macro */
     #if defined(XXH_NO_PREFETCH)
     #  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
     #else
    -#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
    +#  if XXH_SIZE_OPT >= 1
    +#    define XXH_PREFETCH(ptr) (void)(ptr)
    +#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
      #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
     #    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
     #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
    @@ -3203,6 +4227,8 @@ XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
         0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
     };
     
    +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */
    +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */
     
     #ifdef XXH_OLD_NAMES
     #  define kSecret XXH3_kSecret
    @@ -3394,7 +4420,7 @@ XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
     }
     
     /*! Seems to produce slightly better code on GCC for some reason. */
    -XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
    +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
     {
         XXH_ASSERT(0 <= shift && shift < 64);
         return v64 ^ (v64 >> shift);
    @@ -3407,7 +4433,7 @@ XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
     static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
     {
         h64 = XXH_xorshift64(h64, 37);
    -    h64 *= 0x165667919E3779F9ULL;
    +    h64 *= PRIME_MX1;
         h64 = XXH_xorshift64(h64, 32);
         return h64;
     }
    @@ -3421,9 +4447,9 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
     {
         /* this mix is inspired by Pelle Evensen's rrmxmx */
         h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
    -    h64 *= 0x9FB21C651E98DF25ULL;
    +    h64 *= PRIME_MX2;
         h64 ^= (h64 >> 35) + len ;
    -    h64 *= 0x9FB21C651E98DF25ULL;
    +    h64 *= PRIME_MX2;
         return XXH_xorshift64(h64, 28);
     }
     
    @@ -3461,7 +4487,7 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
      *
      * This adds an extra layer of strength for custom secrets.
      */
    -XXH_FORCE_INLINE XXH64_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
     XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         XXH_ASSERT(input != NULL);
    @@ -3483,7 +4509,7 @@ XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h
         }
     }
     
    -XXH_FORCE_INLINE XXH64_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
     XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         XXH_ASSERT(input != NULL);
    @@ -3499,7 +4525,7 @@ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h
         }
     }
     
    -XXH_FORCE_INLINE XXH64_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
     XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         XXH_ASSERT(input != NULL);
    @@ -3516,7 +4542,7 @@ XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_
         }
     }
     
    -XXH_FORCE_INLINE XXH64_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
     XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         XXH_ASSERT(len <= 16);
    @@ -3586,7 +4612,7 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
     }
     
     /* For mid range keys, XXH3 uses a Mum-hash variant. */
    -XXH_FORCE_INLINE XXH64_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
     XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                          const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                          XXH64_hash_t seed)
    @@ -3595,6 +4621,14 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
         XXH_ASSERT(16 < len && len <= 128);
     
         {   xxh_u64 acc = len * XXH_PRIME64_1;
    +#if XXH_SIZE_OPT >= 1
    +        /* Smaller and cleaner, but slightly slower. */
    +        unsigned int i = (unsigned int)(len - 1) / 32;
    +        do {
    +            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
    +            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
    +        } while (i-- != 0);
    +#else
             if (len > 32) {
                 if (len > 64) {
                     if (len > 96) {
    @@ -3609,14 +4643,17 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
             }
             acc += XXH3_mix16B(input+0, secret+0, seed);
             acc += XXH3_mix16B(input+len-16, secret+16, seed);
    -
    +#endif
             return XXH3_avalanche(acc);
         }
     }
     
    +/*!
    + * @brief Maximum size of "short" key in bytes.
    + */
     #define XXH3_MIDSIZE_MAX 240
     
    -XXH_NO_INLINE XXH64_hash_t
    +XXH_NO_INLINE XXH_PUREF XXH64_hash_t
     XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                           const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                           XXH64_hash_t seed)
    @@ -3628,13 +4665,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
         #define XXH3_MIDSIZE_LASTOFFSET  17
     
         {   xxh_u64 acc = len * XXH_PRIME64_1;
    -        int const nbRounds = (int)len / 16;
    -        int i;
    +        xxh_u64 acc_end;
    +        unsigned int const nbRounds = (unsigned int)len / 16;
    +        unsigned int i;
    +        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
             for (i=0; i<8; i++) {
                 acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
             }
    -        acc = XXH3_avalanche(acc);
    +        /* last bytes */
    +        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
             XXH_ASSERT(nbRounds >= 8);
    +        acc = XXH3_avalanche(acc);
     #if defined(__clang__)                                /* Clang */ \
         && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
         && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
    @@ -3661,11 +4702,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
             #pragma clang loop vectorize(disable)
     #endif
             for (i=8 ; i < nbRounds; i++) {
    -            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
    +            /*
     +             * Prevents clang from unrolling the acc loop and interleaving with this one.
    +             */
    +            XXH_COMPILER_GUARD(acc);
    +            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
             }
    -        /* last bytes */
    -        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
    -        return XXH3_avalanche(acc);
    +        return XXH3_avalanche(acc + acc_end);
         }
     }
     
    @@ -3681,6 +4724,47 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
     #  define ACC_NB XXH_ACC_NB
     #endif
     
    +#ifndef XXH_PREFETCH_DIST
    +#  ifdef __clang__
    +#    define XXH_PREFETCH_DIST 320
    +#  else
    +#    if (XXH_VECTOR == XXH_AVX512)
    +#      define XXH_PREFETCH_DIST 512
    +#    else
    +#      define XXH_PREFETCH_DIST 384
    +#    endif
    +#  endif  /* __clang__ */
    +#endif  /* XXH_PREFETCH_DIST */
    +
    +/*
    + * These macros are to generate an XXH3_accumulate() function.
    + * The two arguments select the name suffix and target attribute.
    + *
    + * The name of this symbol is XXH3_accumulate_() and it calls
    + * XXH3_accumulate_512_().
    + *
    + * It may be useful to hand implement this function if the compiler fails to
    + * optimize the inline function.
    + */
    +#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
    +void                                                        \
    +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
    +                       const xxh_u8* XXH_RESTRICT input,    \
    +                       const xxh_u8* XXH_RESTRICT secret,   \
    +                       size_t nbStripes)                    \
    +{                                                           \
    +    size_t n;                                               \
    +    for (n = 0; n < nbStripes; n++ ) {                      \
    +        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
    +        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
    +        XXH3_accumulate_512_##name(                         \
    +                 acc,                                       \
    +                 in,                                        \
    +                 secret + n*XXH_SECRET_CONSUME_RATE);       \
    +    }                                                       \
    +}
    +
    +
     XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
     {
         if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    @@ -3749,7 +4833,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
             /* data_key    = data_vec ^ key_vec; */
             __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
             /* data_key_lo = data_key >> 32; */
    -        __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
    +        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
             /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
             __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
             /* xacc[0] += swap(data_vec); */
    @@ -3759,6 +4843,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
             *xacc = _mm512_add_epi64(product, sum);
         }
     }
    +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
     
     /*
      * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
    @@ -3792,13 +4877,12 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
             /* xacc[0] ^= (xacc[0] >> 47) */
             __m512i const acc_vec     = *xacc;
             __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
    -        __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
             /* xacc[0] ^= secret; */
             __m512i const key_vec     = _mm512_loadu_si512   (secret);
    -        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
    +        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
     
             /* xacc[0] *= XXH_PRIME32_1; */
    -        __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
    +        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
             __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
             __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
             *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
    @@ -3813,7 +4897,8 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
         XXH_ASSERT(((size_t)customSecret & 63) == 0);
         (void)(&XXH_writeLE64);
         {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
    -        __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
    +        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
    +        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
     
             const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
                   __m512i* const dest = (      __m512i*) customSecret;
    @@ -3821,14 +4906,7 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
             XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
             XXH_ASSERT(((size_t)dest & 63) == 0);
             for (i=0; i < nbRounds; ++i) {
    -            /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
    -             * this will warn "discards 'const' qualifier". */
    -            union {
    -                const __m512i* cp;
    -                void* p;
    -            } remote_const_void;
    -            remote_const_void.cp = src + i;
    -            dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
    +            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
         }   }
     }
     
    @@ -3864,7 +4942,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
                 /* data_key    = data_vec ^ key_vec; */
                 __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
                 /* data_key_lo = data_key >> 32; */
    -            __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
    +            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
                 /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
                 __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
                 /* xacc[i] += swap(data_vec); */
    @@ -3874,6 +4952,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
                 xacc[i] = _mm256_add_epi64(product, sum);
         }   }
     }
    +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
     
     XXH_FORCE_INLINE XXH_TARGET_AVX2 void
     XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
    @@ -3896,7 +4975,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
                 __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
     
                 /* xacc[i] *= XXH_PRIME32_1; */
    -            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
    +            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
                 __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
                 __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
                 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
    @@ -3928,12 +5007,12 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR
             XXH_ASSERT(((size_t)dest & 31) == 0);
     
             /* GCC -O2 need unroll loop manually */
    -        dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
    -        dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
    -        dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
    -        dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
    -        dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
    -        dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
    +        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
    +        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
    +        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
    +        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
    +        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
    +        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
         }
     }
     
    @@ -3980,6 +5059,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
                 xacc[i] = _mm_add_epi64(product, sum);
         }   }
     }
    +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
     
     XXH_FORCE_INLINE XXH_TARGET_SSE2 void
     XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
    @@ -4058,14 +5138,28 @@ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
     
     /*!
      * @internal
    - * @brief The bulk processing loop for NEON.
    + * @brief The bulk processing loop for NEON and WASM SIMD128.
      *
      * The NEON code path is actually partially scalar when running on AArch64. This
      * is to optimize the pipelining and can have up to 15% speedup depending on the
      * CPU, and it also mitigates some GCC codegen issues.
      *
      * @see XXH3_NEON_LANES for configuring this and details about this optimization.
    + *
    + * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
    + * integers instead of the other platforms which mask full 64-bit vectors,
    + * so the setup is more complicated than just shifting right.
    + *
    + * Additionally, there is an optimization for 4 lanes at once noted below.
    + *
    + * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
    + * there needs to be *three* versions of the accumulate operation used
    + * for the remaining 2 lanes.
    + *
    + * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
    + * nearly perfectly.
      */
    +
     XXH_FORCE_INLINE void
     XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
                         const void* XXH_RESTRICT input,
    @@ -4073,101 +5167,182 @@ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
     {
         XXH_ASSERT((((size_t)acc) & 15) == 0);
         XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
    -    {
    -        uint64x2_t* const xacc = (uint64x2_t *) acc;
    +    {   /* GCC for darwin arm64 does not like aliasing here */
    +        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
             /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
    -        uint8_t const* const xinput = (const uint8_t *) input;
    -        uint8_t const* const xsecret  = (const uint8_t *) secret;
    +        uint8_t const* xinput = (const uint8_t *) input;
    +        uint8_t const* xsecret  = (const uint8_t *) secret;
     
             size_t i;
    -        /* NEON for the first few lanes (these loops are normally interleaved) */
    -        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
    +#ifdef __wasm_simd128__
    +        /*
    +         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
    +         * is constant propagated, which results in it converting it to this
    +         * inside the loop:
    +         *
    +         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
    +         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
    +         *    ...
    +         *
    +         * This requires a full 32-bit address immediate (and therefore a 6 byte
    +         * instruction) as well as an add for each offset.
    +         *
    +         * Putting an asm guard prevents it from folding (at the cost of losing
    +         * the alignment hint), and uses the free offset in `v128.load` instead
    +         * of adding secret_offset each time which overall reduces code size by
    +         * about a kilobyte and improves performance.
    +         */
    +        XXH_COMPILER_GUARD(xsecret);
    +#endif
    +        /* Scalar lanes use the normal scalarRound routine */
    +        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
    +            XXH3_scalarRound(acc, input, secret, i);
    +        }
    +        i = 0;
    +        /* 4 NEON lanes at a time. */
    +        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
                 /* data_vec = xinput[i]; */
    -            uint8x16_t data_vec    = vld1q_u8(xinput  + (i * 16));
    +            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
    +            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
                 /* key_vec  = xsecret[i];  */
    -            uint8x16_t key_vec     = vld1q_u8(xsecret + (i * 16));
    -            uint64x2_t data_key;
    -            uint32x2_t data_key_lo, data_key_hi;
    -            /* xacc[i] += swap(data_vec); */
    -            uint64x2_t const data64  = vreinterpretq_u64_u8(data_vec);
    -            uint64x2_t const swapped = vextq_u64(data64, data64, 1);
    -            xacc[i] = vaddq_u64 (xacc[i], swapped);
    +            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
    +            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
    +            /* data_swap = swap(data_vec) */
    +            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
    +            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
                 /* data_key = data_vec ^ key_vec; */
    -            data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
    -            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
    -             * data_key_hi = (uint32x2_t) (data_key >> 32);
    -             * data_key = UNDEFINED; */
    -            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
    -            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
    -            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
    +            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
    +            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
     
    +            /*
    +             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
    +             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
    +             * get one vector with the low 32 bits of each lane, and one vector
    +             * with the high 32 bits of each lane.
    +             *
    +             * The intrinsic returns a double vector because the original ARMv7-a
    +             * instruction modified both arguments in place. AArch64 and SIMD128 emit
    +             * two instructions from this intrinsic.
    +             *
    +             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
    +             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
    +             */
    +            uint32x4x2_t unzipped = vuzpq_u32(
    +                vreinterpretq_u32_u64(data_key_1),
    +                vreinterpretq_u32_u64(data_key_2)
    +            );
    +            /* data_key_lo = data_key & 0xFFFFFFFF */
    +            uint32x4_t data_key_lo = unzipped.val[0];
    +            /* data_key_hi = data_key >> 32 */
    +            uint32x4_t data_key_hi = unzipped.val[1];
    +            /*
    +             * Then, we can split the vectors horizontally and multiply which, as for most
     +             * widening intrinsics, has a variant that works on both high half vectors
    +             * for free on AArch64. A similar instruction is available on SIMD128.
    +             *
    +             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
    +             */
    +            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
    +            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
    +            /*
    +             * Clang reorders
    +             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
    +             *    c += a;         // add     acc.2d, acc.2d, swap.2d
    +             * to
    +             *    c += a;         // add     acc.2d, acc.2d, swap.2d
    +             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
    +             *
    +             * While it would make sense in theory since the addition is faster,
    +             * for reasons likely related to umlal being limited to certain NEON
    +             * pipelines, this is worse. A compiler guard fixes this.
    +             */
    +            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
    +            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
    +            /* xacc[i] = acc_vec + sum; */
    +            xacc[i]   = vaddq_u64(xacc[i], sum_1);
    +            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
             }
    -        /* Scalar for the remainder. This may be a zero iteration loop. */
    -        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
    -            XXH3_scalarRound(acc, input, secret, i);
    +        /* Operate on the remaining NEON lanes 2 at a time. */
    +        for (; i < XXH3_NEON_LANES / 2; i++) {
    +            /* data_vec = xinput[i]; */
    +            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
    +            /* key_vec  = xsecret[i];  */
    +            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
    +            /* acc_vec_2 = swap(data_vec) */
    +            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
    +            /* data_key = data_vec ^ key_vec; */
    +            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
    +            /* For two lanes, just use VMOVN and VSHRN. */
    +            /* data_key_lo = data_key & 0xFFFFFFFF; */
    +            uint32x2_t data_key_lo = vmovn_u64(data_key);
    +            /* data_key_hi = data_key >> 32; */
    +            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
    +            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
    +            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
    +            /* Same Clang workaround as before */
    +            XXH_COMPILER_GUARD_CLANG_NEON(sum);
    +            /* xacc[i] = acc_vec + sum; */
    +            xacc[i] = vaddq_u64 (xacc[i], sum);
             }
         }
     }
    +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
     
     XXH_FORCE_INLINE void
     XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
     {
         XXH_ASSERT((((size_t)acc) & 15) == 0);
     
    -    {   uint64x2_t* xacc       = (uint64x2_t*) acc;
    +    {   xxh_aliasing_uint64x2_t* xacc       = (xxh_aliasing_uint64x2_t*) acc;
             uint8_t const* xsecret = (uint8_t const*) secret;
    -        uint32x2_t prime       = vdup_n_u32 (XXH_PRIME32_1);
     
             size_t i;
    -        /* NEON for the first few lanes (these loops are normally interleaved) */
    +        /* WASM uses operator overloads and doesn't need these. */
    +#ifndef __wasm_simd128__
    +        /* { prime32_1, prime32_1 } */
    +        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
    +        /* { 0, prime32_1, 0, prime32_1 } */
    +        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
    +#endif
    +
    +        /* AArch64 uses both scalar and neon at the same time */
    +        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
    +            XXH3_scalarScrambleRound(acc, secret, i);
    +        }
             for (i=0; i < XXH3_NEON_LANES / 2; i++) {
                 /* xacc[i] ^= (xacc[i] >> 47); */
                 uint64x2_t acc_vec  = xacc[i];
    -            uint64x2_t shifted  = vshrq_n_u64 (acc_vec, 47);
    -            uint64x2_t data_vec = veorq_u64   (acc_vec, shifted);
    +            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
    +            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
     
                 /* xacc[i] ^= xsecret[i]; */
    -            uint8x16_t key_vec  = vld1q_u8    (xsecret + (i * 16));
    -            uint64x2_t data_key = veorq_u64   (data_vec, vreinterpretq_u64_u8(key_vec));
    -
    +            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
    +            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
                 /* xacc[i] *= XXH_PRIME32_1 */
    -            uint32x2_t data_key_lo, data_key_hi;
    -            /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
    -             * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
    -             * xacc[i] = UNDEFINED; */
    -            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
    -            {   /*
    -                 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
    -                 *
    -                 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
    -                 * incorrectly "optimize" this:
    -                 *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
    -                 *   shifted = vshll_n_u32(tmp, 32);
    -                 * to this:
    -                 *   tmp     = "vmulq_u64"(a, b); // no such thing!
    -                 *   shifted = vshlq_n_u64(tmp, 32);
    -                 *
    -                 * However, unlike SSE, Clang lacks a 64-bit multiply routine
    -                 * for NEON, and it scalarizes two 64-bit multiplies instead.
    -                 *
    -                 * vmull_u32 has the same timing as vmul_u32, and it avoids
    -                 * this bug completely.
    -                 * See https://bugs.llvm.org/show_bug.cgi?id=39967
    -                 */
    -                uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
    -                /* xacc[i] = prod_hi << 32; */
    -                xacc[i] = vshlq_n_u64(prod_hi, 32);
    -                /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
    -                xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
    -            }
    -        }
    -        /* Scalar for the remainder. This may be a zero iteration loop. */
    -        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
    -            XXH3_scalarScrambleRound(acc, secret, i);
    +#ifdef __wasm_simd128__
    +            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
    +            xacc[i] = data_key * XXH_PRIME32_1;
    +#else
    +            /*
    +             * Expanded version with portable NEON intrinsics
    +             *
    +             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
    +             *
    +             * prod_hi = hi(data_key) * lo(prime) << 32
    +             *
     +             * Since we only need 32 bits of this multiply, a trick can be used, reinterpreting the vector
    +             * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits
    +             * and avoid the shift.
    +             */
    +            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
    +            /* Extract low bits for vmlal_u32  */
    +            uint32x2_t data_key_lo = vmovn_u64(data_key);
    +            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
    +            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
    +#endif
             }
         }
     }
    -
     #endif
     
     #if (XXH_VECTOR == XXH_VSX)
    @@ -4178,23 +5353,23 @@ XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
                         const void* XXH_RESTRICT secret)
     {
         /* presumed aligned */
    -    unsigned int* const xacc = (unsigned int*) acc;
    -    xxh_u64x2 const* const xinput   = (xxh_u64x2 const*) input;   /* no alignment restriction */
    -    xxh_u64x2 const* const xsecret  = (xxh_u64x2 const*) secret;    /* no alignment restriction */
    +    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
    +    xxh_u8 const* const xinput   = (xxh_u8 const*) input;   /* no alignment restriction */
    +    xxh_u8 const* const xsecret  = (xxh_u8 const*) secret;    /* no alignment restriction */
         xxh_u64x2 const v32 = { 32, 32 };
         size_t i;
         for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
             /* data_vec = xinput[i]; */
    -        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
    +        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
             /* key_vec = xsecret[i]; */
    -        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
    +        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
             xxh_u64x2 const data_key = data_vec ^ key_vec;
             /* shuffled = (data_key << 32) | (data_key >> 32); */
             xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
             /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
             xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
             /* acc_vec = xacc[i]; */
    -        xxh_u64x2 acc_vec        = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
    +        xxh_u64x2 acc_vec        = xacc[i];
             acc_vec += product;
     
             /* swap high and low halves */
    @@ -4203,18 +5378,18 @@ XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
     #else
             acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
     #endif
    -        /* xacc[i] = acc_vec; */
    -        vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
    +        xacc[i] = acc_vec;
         }
     }
    +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
     
     XXH_FORCE_INLINE void
     XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
     {
         XXH_ASSERT((((size_t)acc) & 15) == 0);
     
    -    {         xxh_u64x2* const xacc    =       (xxh_u64x2*) acc;
    -        const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
    +    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
    +        const xxh_u8* const xsecret = (const xxh_u8*) secret;
             /* constants */
             xxh_u64x2 const v32  = { 32, 32 };
             xxh_u64x2 const v47 = { 47, 47 };
    @@ -4226,7 +5401,7 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
                 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
     
                 /* xacc[i] ^= xsecret[i]; */
    -            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
    +            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
                 xxh_u64x2 const data_key = data_vec ^ key_vec;
     
                 /* xacc[i] *= XXH_PRIME32_1 */
    @@ -4240,8 +5415,148 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
     
     #endif
     
    +#if (XXH_VECTOR == XXH_SVE)
    +
    +XXH_FORCE_INLINE void
    +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
    +                   const void* XXH_RESTRICT input,
    +                   const void* XXH_RESTRICT secret)
    +{
    +    uint64_t *xacc = (uint64_t *)acc;
    +    const uint64_t *xinput = (const uint64_t *)(const void *)input;
    +    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
    +    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
    +    uint64_t element_count = svcntd();
    +    if (element_count >= 8) {
    +        svbool_t mask = svptrue_pat_b64(SV_VL8);
    +        svuint64_t vacc = svld1_u64(mask, xacc);
    +        ACCRND(vacc, 0);
    +        svst1_u64(mask, xacc, vacc);
    +    } else if (element_count == 2) {   /* sve128 */
    +        svbool_t mask = svptrue_pat_b64(SV_VL2);
    +        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
    +        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
    +        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
    +        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
    +        ACCRND(acc0, 0);
    +        ACCRND(acc1, 2);
    +        ACCRND(acc2, 4);
    +        ACCRND(acc3, 6);
    +        svst1_u64(mask, xacc + 0, acc0);
    +        svst1_u64(mask, xacc + 2, acc1);
    +        svst1_u64(mask, xacc + 4, acc2);
    +        svst1_u64(mask, xacc + 6, acc3);
    +    } else {
    +        svbool_t mask = svptrue_pat_b64(SV_VL4);
    +        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
    +        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
    +        ACCRND(acc0, 0);
    +        ACCRND(acc1, 4);
    +        svst1_u64(mask, xacc + 0, acc0);
    +        svst1_u64(mask, xacc + 4, acc1);
    +    }
    +}
    +
    +XXH_FORCE_INLINE void
    +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
    +               const xxh_u8* XXH_RESTRICT input,
    +               const xxh_u8* XXH_RESTRICT secret,
    +               size_t nbStripes)
    +{
    +    if (nbStripes != 0) {
    +        uint64_t *xacc = (uint64_t *)acc;
    +        const uint64_t *xinput = (const uint64_t *)(const void *)input;
    +        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
    +        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
    +        uint64_t element_count = svcntd();
    +        if (element_count >= 8) {
    +            svbool_t mask = svptrue_pat_b64(SV_VL8);
    +            svuint64_t vacc = svld1_u64(mask, xacc + 0);
    +            do {
     +                /* svprfd(svbool_t, void *, enum svprfop); */
    +                svprfd(mask, xinput + 128, SV_PLDL1STRM);
    +                ACCRND(vacc, 0);
    +                xinput += 8;
    +                xsecret += 1;
    +                nbStripes--;
    +           } while (nbStripes != 0);
    +
    +           svst1_u64(mask, xacc + 0, vacc);
    +        } else if (element_count == 2) { /* sve128 */
    +            svbool_t mask = svptrue_pat_b64(SV_VL2);
    +            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
    +            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
    +            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
    +            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
    +            do {
    +                svprfd(mask, xinput + 128, SV_PLDL1STRM);
    +                ACCRND(acc0, 0);
    +                ACCRND(acc1, 2);
    +                ACCRND(acc2, 4);
    +                ACCRND(acc3, 6);
    +                xinput += 8;
    +                xsecret += 1;
    +                nbStripes--;
    +           } while (nbStripes != 0);
    +
    +           svst1_u64(mask, xacc + 0, acc0);
    +           svst1_u64(mask, xacc + 2, acc1);
    +           svst1_u64(mask, xacc + 4, acc2);
    +           svst1_u64(mask, xacc + 6, acc3);
    +        } else {
    +            svbool_t mask = svptrue_pat_b64(SV_VL4);
    +            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
    +            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
    +            do {
    +                svprfd(mask, xinput + 128, SV_PLDL1STRM);
    +                ACCRND(acc0, 0);
    +                ACCRND(acc1, 4);
    +                xinput += 8;
    +                xsecret += 1;
    +                nbStripes--;
    +           } while (nbStripes != 0);
    +
    +           svst1_u64(mask, xacc + 0, acc0);
    +           svst1_u64(mask, xacc + 4, acc1);
    +       }
    +    }
    +}
    +
    +#endif
    +
     /* scalar variants - universal */
     
    +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
    +/*
    + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
    + * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
    + *
    + * While this might not seem like much, as AArch64 is a 64-bit architecture, only
    + * big Cortex designs have a full 64-bit multiplier.
    + *
    + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
    + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
    + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
    + *
    + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
    + * not have this penalty and does the mask automatically.
    + */
    +XXH_FORCE_INLINE xxh_u64
    +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
    +{
    +    xxh_u64 ret;
    +    /* note: %x = 64-bit register, %w = 32-bit register */
    +    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
    +    return ret;
    +}
    +#else
    +XXH_FORCE_INLINE xxh_u64
    +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
    +{
    +    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
    +}
    +#endif
    +
     /*!
      * @internal
      * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
    @@ -4264,7 +5579,7 @@ XXH3_scalarRound(void* XXH_RESTRICT acc,
             xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
             xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
             xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
    -        xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    +        xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
         }
     }
     
    @@ -4278,10 +5593,18 @@ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
                          const void* XXH_RESTRICT secret)
     {
         size_t i;
    +    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
    +#if defined(__GNUC__) && !defined(__clang__) \
    +  && (defined(__arm__) || defined(__thumb2__)) \
    +  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
    +  && XXH_SIZE_OPT <= 0
    +#  pragma GCC unroll 8
    +#endif
         for (i=0; i < XXH_ACC_NB; i++) {
             XXH3_scalarRound(acc, input, secret, i);
         }
     }
    +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
     
     /*!
      * @internal
    @@ -4333,10 +5656,10 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
         const xxh_u8* kSecretPtr = XXH3_kSecret;
         XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
     
    -#if defined(__clang__) && defined(__aarch64__)
    +#if defined(__GNUC__) && defined(__aarch64__)
         /*
          * UGLY HACK:
    -     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
    +     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
          * placed sequentially, in order, at the top of the unrolled loop.
          *
          * While MOVK is great for generating constants (2 cycles for a 64-bit
    @@ -4351,7 +5674,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
          * ADD
          * SUB      STR
          *          STR
    -     * By forcing loads from memory (as the asm line causes Clang to assume
    +     * By forcing loads from memory (as the asm line causes the compiler to assume
          * that XXH3_kSecretPtr has been changed), the pipelines are used more
          * efficiently:
          *   I   L   S
    @@ -4368,17 +5691,11 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
          */
         XXH_COMPILER_GUARD(kSecretPtr);
     #endif
    -    /*
    -     * Note: in debug mode, this overrides the asm optimization
    -     * and Clang will emit MOVK chains again.
    -     */
    -    XXH_ASSERT(kSecretPtr == XXH3_kSecret);
    -
         {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
             int i;
             for (i=0; i < nbRounds; i++) {
                 /*
    -             * The asm hack causes Clang to assume that kSecretPtr aliases with
    +             * The asm hack causes the compiler to assume that kSecretPtr aliases with
                  * customSecret, and on aarch64, this prevented LDP from merging two
                  * loads together for free. Putting the loads together before the stores
                  * properly generates LDP.
    @@ -4391,7 +5708,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
     }
     
     
    -typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
    +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
     typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
     typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
     
    @@ -4399,82 +5716,63 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
     #if (XXH_VECTOR == XXH_AVX512)
     
     #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
    +#define XXH3_accumulate     XXH3_accumulate_avx512
     #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
     #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
     
     #elif (XXH_VECTOR == XXH_AVX2)
     
     #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
    +#define XXH3_accumulate     XXH3_accumulate_avx2
     #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
     #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
     
     #elif (XXH_VECTOR == XXH_SSE2)
     
     #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
    +#define XXH3_accumulate     XXH3_accumulate_sse2
     #define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
     #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
     
     #elif (XXH_VECTOR == XXH_NEON)
     
     #define XXH3_accumulate_512 XXH3_accumulate_512_neon
    +#define XXH3_accumulate     XXH3_accumulate_neon
     #define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
     #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
     
     #elif (XXH_VECTOR == XXH_VSX)
     
     #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
    +#define XXH3_accumulate     XXH3_accumulate_vsx
     #define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
     #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
     
    +#elif (XXH_VECTOR == XXH_SVE)
    +#define XXH3_accumulate_512 XXH3_accumulate_512_sve
    +#define XXH3_accumulate     XXH3_accumulate_sve
    +#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
    +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
    +
     #else /* scalar */
     
     #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
    +#define XXH3_accumulate     XXH3_accumulate_scalar
     #define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
     #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
     
     #endif
     
    -
    -
    -#ifndef XXH_PREFETCH_DIST
    -#  ifdef __clang__
    -#    define XXH_PREFETCH_DIST 320
    -#  else
    -#    if (XXH_VECTOR == XXH_AVX512)
    -#      define XXH_PREFETCH_DIST 512
    -#    else
    -#      define XXH_PREFETCH_DIST 384
    -#    endif
    -#  endif  /* __clang__ */
    -#endif  /* XXH_PREFETCH_DIST */
    -
    -/*
    - * XXH3_accumulate()
    - * Loops over XXH3_accumulate_512().
    - * Assumption: nbStripes will not overflow the secret size
    - */
    -XXH_FORCE_INLINE void
    -XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
    -                const xxh_u8* XXH_RESTRICT input,
    -                const xxh_u8* XXH_RESTRICT secret,
    -                      size_t nbStripes,
    -                      XXH3_f_accumulate_512 f_acc512)
    -{
    -    size_t n;
    -    for (n = 0; n < nbStripes; n++ ) {
    -        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
    -        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
    -        f_acc512(acc,
    -                 in,
    -                 secret + n*XXH_SECRET_CONSUME_RATE);
    -    }
    -}
    +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
    +#  undef XXH3_initCustomSecret
    +#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
    +#endif
     
     XXH_FORCE_INLINE void
     XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
                           const xxh_u8* XXH_RESTRICT input, size_t len,
                           const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
    -                            XXH3_f_accumulate_512 f_acc512,
    +                            XXH3_f_accumulate f_acc,
                                 XXH3_f_scrambleAcc f_scramble)
     {
         size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    @@ -4486,7 +5784,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
         XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
     
         for (n = 0; n < nb_blocks; n++) {
    -        XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
    +        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
             f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
         }
     
    @@ -4494,12 +5792,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
         XXH_ASSERT(len > XXH_STRIPE_LEN);
         {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
             XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
    -        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
    +        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
     
             /* last stripe */
             {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
     #define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
    -            f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
    +            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
         }   }
     }
     
    @@ -4544,12 +5842,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre
     XXH_FORCE_INLINE XXH64_hash_t
     XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
                                const void* XXH_RESTRICT secret, size_t secretSize,
    -                           XXH3_f_accumulate_512 f_acc512,
    +                           XXH3_f_accumulate f_acc,
                                XXH3_f_scrambleAcc f_scramble)
     {
         XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
     
    -    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
    +    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
     
         /* converge into final hash */
         XXH_STATIC_ASSERT(sizeof(acc) == 64);
    @@ -4563,13 +5861,15 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
      * It's important for performance to transmit secret's size (when it's static)
      * so that the compiler can properly optimize the vectorized loop.
      * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
    + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
    + * breaks -Og, this is XXH_NO_INLINE.
      */
    -XXH_FORCE_INLINE XXH64_hash_t
    +XXH3_WITH_SECRET_INLINE XXH64_hash_t
     XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
                                  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
     {
         (void)seed64;
    -    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
    +    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
     }
     
     /*
    @@ -4578,12 +5878,12 @@ XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
      * Note that inside this no_inline function, we do inline the internal loop,
      * and provide a statically defined secret size to allow optimization of vector loop.
      */
    -XXH_NO_INLINE XXH64_hash_t
    +XXH_NO_INLINE XXH_PUREF XXH64_hash_t
     XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
                               XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
     {
         (void)seed64; (void)secret; (void)secretLen;
    -    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
    +    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
     }
     
     /*
    @@ -4600,18 +5900,20 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
     XXH_FORCE_INLINE XXH64_hash_t
     XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
                                         XXH64_hash_t seed,
    -                                    XXH3_f_accumulate_512 f_acc512,
    +                                    XXH3_f_accumulate f_acc,
                                         XXH3_f_scrambleAcc f_scramble,
                                         XXH3_f_initCustomSecret f_initSec)
     {
    +#if XXH_SIZE_OPT <= 0
         if (seed == 0)
             return XXH3_hashLong_64b_internal(input, len,
                                               XXH3_kSecret, sizeof(XXH3_kSecret),
    -                                          f_acc512, f_scramble);
    +                                          f_acc, f_scramble);
    +#endif
         {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
             f_initSec(secret, seed);
             return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
    -                                          f_acc512, f_scramble);
    +                                          f_acc, f_scramble);
         }
     }
     
    @@ -4619,12 +5921,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
      * It's important for performance that XXH3_hashLong is not inlined.
      */
     XXH_NO_INLINE XXH64_hash_t
    -XXH3_hashLong_64b_withSeed(const void* input, size_t len,
    -                           XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
    +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
    +                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
     {
         (void)secret; (void)secretLen;
         return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
    -                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
    +                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
     }
     
     
    @@ -4656,37 +5958,37 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
     
     /* ===   Public entry point   === */
     
    -/*! @ingroup xxh3_family */
    -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
    +/*! @ingroup XXH3_family */
    +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
     {
    -    return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
    +    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH64_hash_t
    -XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
    +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
     {
    -    return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
    +    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH64_hash_t
    -XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
    +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
     {
    -    return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
    +    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
     }
     
     XXH_PUBLIC_API XXH64_hash_t
    -XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
    +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
     {
    -    if (len <= XXH3_MIDSIZE_MAX)
    -        return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    -    return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);
    +    if (length <= XXH3_MIDSIZE_MAX)
    +        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    +    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
     }
     
     
     /* ===   XXH3 streaming   === */
    -
    +#ifndef XXH_NO_STREAM
     /*
      * Malloc's a pointer that is always aligned to align.
      *
    @@ -4710,7 +6012,7 @@ XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret,
      *
      * Align must be a power of 2 and 8 <= align <= 128.
      */
    -static void* XXH_alignedMalloc(size_t s, size_t align)
    +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
     {
         XXH_ASSERT(align <= 128 && align >= 8); /* range check */
         XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
    @@ -4752,7 +6054,15 @@ static void XXH_alignedFree(void* p)
             XXH_free(base);
         }
     }
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
    +/*!
    + * @brief Allocate an @ref XXH3_state_t.
    + *
    + * @return An allocated pointer of @ref XXH3_state_t on success.
    + * @return `NULL` on failure.
    + *
    + * @note Must be freed with XXH3_freeState().
    + */
     XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
     {
         XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
    @@ -4761,16 +6071,25 @@ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
         return state;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
    +/*!
    + * @brief Frees an @ref XXH3_state_t.
    + *
    + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
    + *
    + * @return @ref XXH_OK.
    + *
    + * @note Must be allocated with XXH3_createState().
    + */
     XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
     {
         XXH_alignedFree(statePtr);
         return XXH_OK;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API void
    -XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
    +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
     {
         XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
     }
    @@ -4802,18 +6121,18 @@ XXH3_reset_internal(XXH3_state_t* statePtr,
         statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_64bits_reset(XXH3_state_t* statePtr)
    +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
     {
         if (statePtr == NULL) return XXH_ERROR;
         XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
         return XXH_OK;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
    +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
     {
         if (statePtr == NULL) return XXH_ERROR;
         XXH3_reset_internal(statePtr, 0, secret, secretSize);
    @@ -4822,9 +6141,9 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t
         return XXH_OK;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
    +XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
     {
         if (statePtr == NULL) return XXH_ERROR;
         if (seed==0) return XXH3_64bits_reset(statePtr);
    @@ -4834,9 +6153,9 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
         return XXH_OK;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
    +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
     {
         if (statePtr == NULL) return XXH_ERROR;
         if (secret == NULL) return XXH_ERROR;
    @@ -4846,35 +6165,61 @@ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret,
         return XXH_OK;
     }
     
    -/* Note : when XXH3_consumeStripes() is invoked,
    - * there must be a guarantee that at least one more byte must be consumed from input
    - * so that the function can blindly consume all stripes using the "normal" secret segment */
    -XXH_FORCE_INLINE void
    +/*!
    + * @internal
    + * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
    + *
    + * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
    + *
    + * @param acc                Pointer to the 8 accumulator lanes
     + * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block
    + * @param nbStripesPerBlock  Number of stripes in a block
    + * @param input              Input pointer
    + * @param nbStripes          Number of stripes to process
    + * @param secret             Secret pointer
    + * @param secretLimit        Offset of the last block in @p secret
    + * @param f_acc              Pointer to an XXH3_accumulate implementation
    + * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
    + * @return                   Pointer past the end of @p input after processing
    + */
    +XXH_FORCE_INLINE const xxh_u8 *
     XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
                         size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
                         const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
                         const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
    -                    XXH3_f_accumulate_512 f_acc512,
    +                    XXH3_f_accumulate f_acc,
                         XXH3_f_scrambleAcc f_scramble)
     {
    -    XXH_ASSERT(nbStripes <= nbStripesPerBlock);  /* can handle max 1 scramble per invocation */
    -    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
    -    if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
    -        /* need a scrambling operation */
    -        size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
    -        size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
    -        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
    -        f_scramble(acc, secret + secretLimit);
    -        XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
    -        *nbStripesSoFarPtr = nbStripesAfterBlock;
    -    } else {
    -        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
    +    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
    +    /* Process full blocks */
    +    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
    +        /* Process the initial partial block... */
    +        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
    +
    +        do {
    +            /* Accumulate and scramble */
    +            f_acc(acc, input, initialSecret, nbStripesThisIter);
    +            f_scramble(acc, secret + secretLimit);
    +            input += nbStripesThisIter * XXH_STRIPE_LEN;
    +            nbStripes -= nbStripesThisIter;
    +            /* Then continue the loop with the full block size */
    +            nbStripesThisIter = nbStripesPerBlock;
    +            initialSecret = secret;
    +        } while (nbStripes >= nbStripesPerBlock);
    +        *nbStripesSoFarPtr = 0;
    +    }
    +    /* Process a partial block */
    +    if (nbStripes > 0) {
    +        f_acc(acc, input, initialSecret, nbStripes);
    +        input += nbStripes * XXH_STRIPE_LEN;
             *nbStripesSoFarPtr += nbStripes;
         }
    +    /* Return end pointer */
    +    return input;
     }
     
     #ifndef XXH3_STREAM_USE_STACK
    -# ifndef __clang__ /* clang doesn't need additional stack space */
    +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
     #   define XXH3_STREAM_USE_STACK 1
     # endif
     #endif
    @@ -4884,7 +6229,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
     XXH_FORCE_INLINE XXH_errorcode
     XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
                 const xxh_u8* XXH_RESTRICT input, size_t len,
    -            XXH3_f_accumulate_512 f_acc512,
    +            XXH3_f_accumulate f_acc,
                 XXH3_f_scrambleAcc f_scramble)
     {
         if (input==NULL) {
    @@ -4900,7 +6245,8 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
              * when operating accumulators directly into state.
              * Operating into stack space seems to enable proper optimization.
              * clang, on the other hand, doesn't seem to need this trick */
    -        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
    +        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
    +        XXH_memcpy(acc, state->acc, sizeof(acc));
     #else
             xxh_u64* XXH_RESTRICT const acc = state->acc;
     #endif
    @@ -4908,7 +6254,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
             XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
     
             /* small input : just fill in tmp buffer */
    -        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
    +        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
                 XXH_memcpy(state->buffer + state->bufferedSize, input, len);
                 state->bufferedSize += (XXH32_hash_t)len;
                 return XXH_OK;
    @@ -4930,57 +6276,20 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
                                    &state->nbStripesSoFar, state->nbStripesPerBlock,
                                     state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                                     secret, state->secretLimit,
    -                                f_acc512, f_scramble);
    +                                f_acc, f_scramble);
                 state->bufferedSize = 0;
             }
             XXH_ASSERT(input < bEnd);
    -
    -        /* large input to consume : ingest per full block */
    -        if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
    +        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
                 size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
    -            XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
    -            /* join to current block's end */
    -            {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
    -                XXH_ASSERT(nbStripesToEnd <= nbStripes);
    -                XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
    -                f_scramble(acc, secret + state->secretLimit);
    -                state->nbStripesSoFar = 0;
    -                input += nbStripesToEnd * XXH_STRIPE_LEN;
    -                nbStripes -= nbStripesToEnd;
    -            }
    -            /* consume per entire blocks */
    -            while(nbStripes >= state->nbStripesPerBlock) {
    -                XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
    -                f_scramble(acc, secret + state->secretLimit);
    -                input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
    -                nbStripes -= state->nbStripesPerBlock;
    -            }
    -            /* consume last partial block */
    -            XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
    -            input += nbStripes * XXH_STRIPE_LEN;
    -            XXH_ASSERT(input < bEnd);  /* at least some bytes left */
    -            state->nbStripesSoFar = nbStripes;
    -            /* buffer predecessor of last partial stripe */
    -            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
    -            XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
    -        } else {
    -            /* content to consume <= block size */
    -            /* Consume input by a multiple of internal buffer size */
    -            if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
    -                const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
    -                do {
    -                    XXH3_consumeStripes(acc,
    +            input = XXH3_consumeStripes(acc,
                                            &state->nbStripesSoFar, state->nbStripesPerBlock,
    -                                        input, XXH3_INTERNALBUFFER_STRIPES,
    -                                        secret, state->secretLimit,
    -                                        f_acc512, f_scramble);
    -                    input += XXH3_INTERNALBUFFER_SIZE;
    -                } while (inputbuffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
    -            }
    -        }
    +                                       input, nbStripes,
    +                                       secret, state->secretLimit,
    +                                       f_acc, f_scramble);
    +            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
     
    +        }
             /* Some remaining input (always) : buffer it */
             XXH_ASSERT(input < bEnd);
             XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
    @@ -4989,19 +6298,19 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
             state->bufferedSize = (XXH32_hash_t)(bEnd-input);
     #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
             /* save stack accumulators into state */
    -        memcpy(state->acc, acc, sizeof(acc));
    +        XXH_memcpy(state->acc, acc, sizeof(acc));
     #endif
         }
     
         return XXH_OK;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
    +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
     {
         return XXH3_update(state, (const xxh_u8*)input, len,
    -                       XXH3_accumulate_512, XXH3_scrambleAcc);
    +                       XXH3_accumulate, XXH3_scrambleAcc);
     }
     
     
    @@ -5010,37 +6319,40 @@ XXH3_digest_long (XXH64_hash_t* acc,
                       const XXH3_state_t* state,
                       const unsigned char* secret)
     {
    +    xxh_u8 lastStripe[XXH_STRIPE_LEN];
    +    const xxh_u8* lastStripePtr;
    +
         /*
          * Digest on a local copy. This way, the state remains unaltered, and it can
          * continue ingesting more input afterwards.
          */
         XXH_memcpy(acc, state->acc, sizeof(state->acc));
         if (state->bufferedSize >= XXH_STRIPE_LEN) {
    +        /* Consume remaining stripes then point to remaining data in buffer */
             size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
             size_t nbStripesSoFar = state->nbStripesSoFar;
             XXH3_consumeStripes(acc,
                                &nbStripesSoFar, state->nbStripesPerBlock,
                                 state->buffer, nbStripes,
                                 secret, state->secretLimit,
    -                            XXH3_accumulate_512, XXH3_scrambleAcc);
    -        /* last stripe */
    -        XXH3_accumulate_512(acc,
    -                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
    -                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
    +                            XXH3_accumulate, XXH3_scrambleAcc);
    +        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
         } else {  /* bufferedSize < XXH_STRIPE_LEN */
    -        xxh_u8 lastStripe[XXH_STRIPE_LEN];
    +        /* Copy to temp buffer */
             size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
             XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
             XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
             XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
    -        XXH3_accumulate_512(acc,
    -                            lastStripe,
    -                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
    +        lastStripePtr = lastStripe;
         }
    +    /* Last stripe */
    +    XXH3_accumulate_512(acc,
    +                        lastStripePtr,
    +                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
     }
     
    -/*! @ingroup xxh3_family */
    -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
    +/*! @ingroup XXH3_family */
    +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
     {
         const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
         if (state->totalLen > XXH3_MIDSIZE_MAX) {
    @@ -5056,7 +6368,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
         return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                       secret, state->secretLimit + XXH_STRIPE_LEN);
     }
    -
    +#endif /* !XXH_NO_STREAM */
     
     
     /* ==========================================
    @@ -5076,7 +6388,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
      * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
      */
     
    -XXH_FORCE_INLINE XXH128_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
     XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         /* A doubled version of 1to3_64b with different constants. */
    @@ -5105,7 +6417,7 @@ XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_
         }
     }
     
    -XXH_FORCE_INLINE XXH128_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
     XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         XXH_ASSERT(input != NULL);
    @@ -5125,14 +6437,14 @@ XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_
             m128.low64  ^= (m128.high64 >> 3);
     
             m128.low64   = XXH_xorshift64(m128.low64, 35);
    -        m128.low64  *= 0x9FB21C651E98DF25ULL;
    +        m128.low64  *= PRIME_MX2;
             m128.low64   = XXH_xorshift64(m128.low64, 28);
             m128.high64  = XXH3_avalanche(m128.high64);
             return m128;
         }
     }
     
    -XXH_FORCE_INLINE XXH128_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
     XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         XXH_ASSERT(input != NULL);
    @@ -5207,7 +6519,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64
     /*
      * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
      */
    -XXH_FORCE_INLINE XXH128_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
     XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
     {
         XXH_ASSERT(len <= 16);
    @@ -5238,7 +6550,7 @@ XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
     }
     
     
    -XXH_FORCE_INLINE XXH128_hash_t
    +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
     XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                           const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                           XXH64_hash_t seed)
    @@ -5249,6 +6561,16 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
         {   XXH128_hash_t acc;
             acc.low64 = len * XXH_PRIME64_1;
             acc.high64 = 0;
    +
    +#if XXH_SIZE_OPT >= 1
    +        {
    +            /* Smaller, but slightly slower. */
    +            unsigned int i = (unsigned int)(len - 1) / 32;
    +            do {
    +                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
    +            } while (i-- != 0);
    +        }
    +#else
             if (len > 32) {
                 if (len > 64) {
                     if (len > 96) {
    @@ -5259,6 +6581,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
             }
             acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
    +#endif
             {   XXH128_hash_t h128;
                 h128.low64  = acc.low64 + acc.high64;
                 h128.high64 = (acc.low64    * XXH_PRIME64_1)
    @@ -5271,7 +6594,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
         }
     }
     
    -XXH_NO_INLINE XXH128_hash_t
    +XXH_NO_INLINE XXH_PUREF XXH128_hash_t
     XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH64_hash_t seed)
    @@ -5280,25 +6603,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
         XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
     
         {   XXH128_hash_t acc;
    -        int const nbRounds = (int)len / 32;
    -        int i;
    +        unsigned i;
             acc.low64 = len * XXH_PRIME64_1;
             acc.high64 = 0;
    -        for (i=0; i<4; i++) {
    +        /*
    +         *  We set as `i` as offset + 32. We do this so that unchanged
    +         * `len` can be used as upper bound. This reaches a sweet spot
    +         * where both x86 and aarch64 get simple agen and good codegen
    +         * for the loop.
    +         */
    +        for (i = 32; i < 160; i += 32) {
                 acc = XXH128_mix32B(acc,
    -                                input  + (32 * i),
    -                                input  + (32 * i) + 16,
    -                                secret + (32 * i),
    +                                input  + i - 32,
    +                                input  + i - 16,
    +                                secret + i - 32,
                                     seed);
             }
             acc.low64 = XXH3_avalanche(acc.low64);
             acc.high64 = XXH3_avalanche(acc.high64);
    -        XXH_ASSERT(nbRounds >= 4);
    -        for (i=4 ; i < nbRounds; i++) {
    +        /*
    +         * NB: `i <= len` will duplicate the last 32-bytes if
    +         * len % 32 was zero. This is an unfortunate necessity to keep
    +         * the hash result stable.
    +         */
    +        for (i=160; i <= len; i += 32) {
                 acc = XXH128_mix32B(acc,
    -                                input + (32 * i),
    -                                input + (32 * i) + 16,
    -                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
    +                                input + i - 32,
    +                                input + i - 16,
    +                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
                                     seed);
             }
             /* last bytes */
    @@ -5306,7 +6638,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                                 input + len - 16,
                                 input + len - 32,
                                 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
    -                            0ULL - seed);
    +                            (XXH64_hash_t)0 - seed);
     
             {   XXH128_hash_t h128;
                 h128.low64  = acc.low64 + acc.high64;
    @@ -5323,12 +6655,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
     XXH_FORCE_INLINE XXH128_hash_t
     XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                                 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
    -                            XXH3_f_accumulate_512 f_acc512,
    +                            XXH3_f_accumulate f_acc,
                                 XXH3_f_scrambleAcc f_scramble)
     {
         XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
     
    -    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
    +    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
     
         /* converge into final hash */
         XXH_STATIC_ASSERT(sizeof(acc) == 64);
    @@ -5346,47 +6678,50 @@ XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
     }
     
     /*
    - * It's important for performance that XXH3_hashLong is not inlined.
    + * It's important for performance that XXH3_hashLong() is not inlined.
      */
    -XXH_NO_INLINE XXH128_hash_t
    +XXH_NO_INLINE XXH_PUREF XXH128_hash_t
     XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
                                XXH64_hash_t seed64,
                                const void* XXH_RESTRICT secret, size_t secretLen)
     {
         (void)seed64; (void)secret; (void)secretLen;
         return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
    -                                       XXH3_accumulate_512, XXH3_scrambleAcc);
    +                                       XXH3_accumulate, XXH3_scrambleAcc);
     }
     
     /*
    - * It's important for performance to pass @secretLen (when it's static)
    + * It's important for performance to pass @p secretLen (when it's static)
      * to the compiler, so that it can properly optimize the vectorized loop.
    + *
    + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
    + * breaks -Og, this is XXH_NO_INLINE.
      */
    -XXH_FORCE_INLINE XXH128_hash_t
    +XXH3_WITH_SECRET_INLINE XXH128_hash_t
     XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                                   XXH64_hash_t seed64,
                                   const void* XXH_RESTRICT secret, size_t secretLen)
     {
         (void)seed64;
         return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
    -                                       XXH3_accumulate_512, XXH3_scrambleAcc);
    +                                       XXH3_accumulate, XXH3_scrambleAcc);
     }
     
     XXH_FORCE_INLINE XXH128_hash_t
     XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                     XXH64_hash_t seed64,
    -                                XXH3_f_accumulate_512 f_acc512,
    +                                XXH3_f_accumulate f_acc,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
     {
         if (seed64 == 0)
             return XXH3_hashLong_128b_internal(input, len,
                                                XXH3_kSecret, sizeof(XXH3_kSecret),
    -                                           f_acc512, f_scramble);
    +                                           f_acc, f_scramble);
         {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
             f_initSec(secret, seed64);
             return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
    -                                           f_acc512, f_scramble);
    +                                           f_acc, f_scramble);
         }
     }
     
    @@ -5399,7 +6734,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len,
     {
         (void)secret; (void)secretLen;
         return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
    -                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
    +                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
     }
     
     typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
    @@ -5429,94 +6764,93 @@ XXH3_128bits_internal(const void* input, size_t len,
     
     /* ===   Public XXH128 API   === */
     
    -/*! @ingroup xxh3_family */
    -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
    +/*! @ingroup XXH3_family */
    +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
     {
         return XXH3_128bits_internal(input, len, 0,
                                      XXH3_kSecret, sizeof(XXH3_kSecret),
                                      XXH3_hashLong_128b_default);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH128_hash_t
    -XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
    +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
     {
         return XXH3_128bits_internal(input, len, 0,
                                      (const xxh_u8*)secret, secretSize,
                                      XXH3_hashLong_128b_withSecret);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH128_hash_t
    -XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
    +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
     {
         return XXH3_128bits_internal(input, len, seed,
                                      XXH3_kSecret, sizeof(XXH3_kSecret),
                                      XXH3_hashLong_128b_withSeed);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH128_hash_t
    -XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
    +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
     {
         if (len <= XXH3_MIDSIZE_MAX)
             return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
         return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH128_hash_t
    -XXH128(const void* input, size_t len, XXH64_hash_t seed)
    +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
     {
         return XXH3_128bits_withSeed(input, len, seed);
     }
     
     
     /* ===   XXH3 128-bit streaming   === */
    -
    +#ifndef XXH_NO_STREAM
     /*
      * All initialization and update functions are identical to 64-bit streaming variant.
      * The only difference is the finalization routine.
      */
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_128bits_reset(XXH3_state_t* statePtr)
    +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
     {
         return XXH3_64bits_reset(statePtr);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
    +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
     {
         return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
    +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
     {
         return XXH3_64bits_reset_withSeed(statePtr, seed);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
    +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
     {
         return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
    +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
     {
    -    return XXH3_update(state, (const xxh_u8*)input, len,
    -                       XXH3_accumulate_512, XXH3_scrambleAcc);
    +    return XXH3_64bits_update(state, input, len);
     }
     
    -/*! @ingroup xxh3_family */
    -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
    +/*! @ingroup XXH3_family */
    +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
     {
         const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
         if (state->totalLen > XXH3_MIDSIZE_MAX) {
    @@ -5540,13 +6874,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
         return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                        secret, state->secretLimit + XXH_STRIPE_LEN);
     }
    -
    +#endif /* !XXH_NO_STREAM */
     /* 128-bit utility functions */
     
     #include    /* memcmp, memcpy */
     
     /* return : 1 is equal, 0 if different */
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
     {
         /* note : XXH128_hash_t is compact, it has no padding byte */
    @@ -5554,11 +6888,11 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
     }
     
     /* This prototype is compatible with stdlib's qsort().
    - * return : >0 if *h128_1  > *h128_2
    - *          <0 if *h128_1  < *h128_2
    - *          =0 if *h128_1 == *h128_2  */
    -/*! @ingroup xxh3_family */
    -XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
    + * @return : >0 if *h128_1  > *h128_2
    + *           <0 if *h128_1  < *h128_2
    + *           =0 if *h128_1 == *h128_2  */
    +/*! @ingroup XXH3_family */
    +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
     {
         XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
         XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    @@ -5570,9 +6904,9 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
     
     
     /*======   Canonical representation   ======*/
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API void
    -XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
    +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
     {
         XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
         if (XXH_CPU_LITTLE_ENDIAN) {
    @@ -5583,9 +6917,9 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
         XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH128_hash_t
    -XXH128_hashFromCanonical(const XXH128_canonical_t* src)
    +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
     {
         XXH128_hash_t h;
         h.high64 = XXH_readBE64(src);
    @@ -5607,9 +6941,9 @@ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
         XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API XXH_errorcode
    -XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
    +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
     {
     #if (XXH_DEBUGLEVEL >= 1)
         XXH_ASSERT(secretBuffer != NULL);
    @@ -5652,9 +6986,9 @@ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSee
         return XXH_OK;
     }
     
    -/*! @ingroup xxh3_family */
    +/*! @ingroup XXH3_family */
     XXH_PUBLIC_API void
    -XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
    +XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
     {
         XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
         XXH3_initCustomSecret(secret, seed);
    @@ -5667,7 +7001,7 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
     /* Pop our optimization override from above */
     #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
       && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
    -  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
    +  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
     #  pragma GCC pop_options
     #endif
     
    @@ -5682,5 +7016,5 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
     
     
     #if defined (__cplusplus)
    -}
    +} /* extern "C" */
     #endif
    diff --git a/third-party/zstd/lib/common/zstd_internal.h b/third-party/zstd/lib/common/zstd_internal.h
    index 1f942f27..ecb9cfba 100644
    --- a/third-party/zstd/lib/common/zstd_internal.h
    +++ b/third-party/zstd/lib/common/zstd_internal.h
    @@ -178,7 +178,7 @@ static void ZSTD_copy8(void* dst, const void* src) {
         ZSTD_memcpy(dst, src, 8);
     #endif
     }
    -#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
    +#define COPY8(d,s) do { ZSTD_copy8(d,s); d+=8; s+=8; } while (0)
     
     /* Need to use memmove here since the literal buffer can now be located within
        the dst buffer. In circumstances where the op "catches up" to where the
    @@ -198,7 +198,7 @@ static void ZSTD_copy16(void* dst, const void* src) {
         ZSTD_memcpy(dst, copy16_buf, 16);
     #endif
     }
    -#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
    +#define COPY16(d,s) do { ZSTD_copy16(d,s); d+=16; s+=16; } while (0)
     
     #define WILDCOPY_OVERLENGTH 32
     #define WILDCOPY_VECLEN 16
    @@ -227,7 +227,7 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
         if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
             /* Handle short offset copies. */
             do {
    -            COPY8(op, ip)
    +            COPY8(op, ip);
             } while (op < oend);
         } else {
             assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
    @@ -366,13 +366,13 @@ typedef struct {
     
     /*! ZSTD_getcBlockSize() :
      *  Provides the size of compressed block from block header `src` */
    -/* Used by: decompress, fullbench (does not get its definition from here) */
    +/*  Used by: decompress, fullbench */
     size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                               blockProperties_t* bpPtr);
     
     /*! ZSTD_decodeSeqHeaders() :
      *  decode sequence header from src */
    -/* Used by: decompress, fullbench (does not get its definition from here) */
    +/*  Used by: zstd_decompress_block, fullbench */
     size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                            const void* src, size_t srcSize);
     
    diff --git a/third-party/zstd/lib/compress/fse_compress.c b/third-party/zstd/lib/compress/fse_compress.c
    index 5d377080..1ce3cf16 100644
    --- a/third-party/zstd/lib/compress/fse_compress.c
    +++ b/third-party/zstd/lib/compress/fse_compress.c
    @@ -25,7 +25,7 @@
     #include "../common/error_private.h"
     #define ZSTD_DEPS_NEED_MALLOC
     #define ZSTD_DEPS_NEED_MATH64
    -#include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
    +#include "../common/zstd_deps.h"  /* ZSTD_memset */
     #include "../common/bits.h" /* ZSTD_highbit32 */
     
     
    @@ -225,8 +225,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
         size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog
                                        + 4 /* bitCount initialized at 4 */
                                        + 2 /* first two symbols may use one additional bit each */) / 8)
    -                                    + 1 /* round up to whole nb bytes */
    -                                    + 2 /* additional two bytes for bitstream flush */;
    +                                   + 1 /* round up to whole nb bytes */
    +                                   + 2 /* additional two bytes for bitstream flush */;
         return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
     }
     
    @@ -255,7 +255,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
         /* Init */
         remaining = tableSize+1;   /* +1 for extra accuracy */
         threshold = tableSize;
    -    nbBits = tableLog+1;
    +    nbBits = (int)tableLog+1;
     
         while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
             if (previousIs0) {
    @@ -274,7 +274,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
                 }
                 while (symbol >= start+3) {
                     start+=3;
    -                bitStream += 3 << bitCount;
    +                bitStream += 3U << bitCount;
                     bitCount += 2;
                 }
                 bitStream += (symbol-start) << bitCount;
    @@ -294,7 +294,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
                 count++;   /* +1 for extra accuracy */
                 if (count>=threshold)
                     count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
    -            bitStream += count << bitCount;
    +            bitStream += (U32)count << bitCount;
                 bitCount  += nbBits;
                 bitCount  -= (count>8);
         out+= (bitCount+7) /8;
     
    -    return (out-ostart);
    +    assert(out >= ostart);
    +    return (size_t)(out-ostart);
     }
     
     
    diff --git a/third-party/zstd/lib/compress/huf_compress.c b/third-party/zstd/lib/compress/huf_compress.c
    index 29871877..ea000723 100644
    --- a/third-party/zstd/lib/compress/huf_compress.c
    +++ b/third-party/zstd/lib/compress/huf_compress.c
    @@ -220,6 +220,25 @@ static void HUF_setValue(HUF_CElt* elt, size_t value)
         }
     }
     
    +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable)
    +{
    +    HUF_CTableHeader header;
    +    ZSTD_memcpy(&header, ctable, sizeof(header));
    +    return header;
    +}
    +
    +static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue)
    +{
    +    HUF_CTableHeader header;
    +    HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header));
    +    ZSTD_memset(&header, 0, sizeof(header));
    +    assert(tableLog < 256);
    +    header.tableLog = (BYTE)tableLog;
    +    assert(maxSymbolValue < 256);
    +    header.maxSymbolValue = (BYTE)maxSymbolValue;
    +    ZSTD_memcpy(ctable, &header, sizeof(header));
    +}
    +
     typedef struct {
         HUF_CompressWeightsWksp wksp;
         BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
    @@ -237,6 +256,9 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     
         HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
     
    +    assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue);
    +    assert(HUF_readCTableHeader(CTable).tableLog == huffLog);
    +
         /* check conditions */
         if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
         if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
    @@ -283,7 +305,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
         if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
         if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
     
    -    CTable[0] = tableLog;
    +    *maxSymbolValuePtr = nbSymbols - 1;
    +
    +    HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr);
     
         /* Prepare base value per rank */
         {   U32 n, nextRankStart = 0;
    @@ -315,7 +339,6 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
             { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue)
    +        return 0;
         return (U32)HUF_getNbBits(ct[symbolValue]);
     }
     
    @@ -723,7 +748,8 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
             HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits);   /* push nbBits per symbol, symbol order */
         for (n=0; n 11)
    @@ -1255,7 +1288,7 @@ unsigned HUF_optimalTableLog(
     
         {   BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
             size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
    -        size_t maxBits, hSize, newSize;
    +        size_t hSize, newSize;
             const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
             const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
             size_t optSize = ((size_t) ~0) - 1;
    @@ -1266,12 +1299,14 @@ unsigned HUF_optimalTableLog(
             /* Search until size increases */
             for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
                 DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
    -            maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
    -            if (ERR_isError(maxBits)) continue;
     
    -            if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
    +            {   size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
    +                if (ERR_isError(maxBits)) continue;
    +
    +                if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
     
    -            hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
    +                hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
    +            }
     
                 if (ERR_isError(hSize)) continue;
     
    @@ -1372,12 +1407,6 @@ HUF_compress_internal (void* dst, size_t dstSize,
             huffLog = (U32)maxBits;
             DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
         }
    -    /* Zero unused symbols in CTable, so we can check it for validity */
    -    {
    -        size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
    -        size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
    -        ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
    -    }
     
         /* Write table description header */
         {   CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog,
    @@ -1420,7 +1449,7 @@ size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
     /* HUF_compress4X_repeat():
      * compress input using 4 streams.
      * consider skipping quickly
    - * re-use an existing huffman compression table */
    + * reuse an existing huffman compression table */
     size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
                           const void* src, size_t srcSize,
                           unsigned maxSymbolValue, unsigned huffLog,
    diff --git a/third-party/zstd/lib/compress/zstd_compress.c b/third-party/zstd/lib/compress/zstd_compress.c
    index d6133e70..9284e2a4 100644
    --- a/third-party/zstd/lib/compress/zstd_compress.c
    +++ b/third-party/zstd/lib/compress/zstd_compress.c
    @@ -178,6 +178,7 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
     
     size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
     {
    +    DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx);
         if (cctx==NULL) return 0;   /* support free on NULL */
         RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
                         "not compatible with static CCtx");
    @@ -649,10 +650,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value)
         return 0;
     }
     
    -#define BOUNDCHECK(cParam, val) { \
    -    RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \
    -                    parameter_outOfBound, "Param out of bounds"); \
    -}
    +#define BOUNDCHECK(cParam, val)                                       \
    +    do {                                                              \
    +        RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val),        \
    +                        parameter_outOfBound, "Param out of bounds"); \
    +    } while (0)
     
     
     static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
    @@ -868,7 +870,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     #else
             FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
             CCtxParams->nbWorkers = value;
    -        return CCtxParams->nbWorkers;
    +        return (size_t)(CCtxParams->nbWorkers);
     #endif
     
         case ZSTD_c_jobSize :
    @@ -892,7 +894,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     #else
             FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
             CCtxParams->overlapLog = value;
    -        return CCtxParams->overlapLog;
    +        return (size_t)CCtxParams->overlapLog;
     #endif
     
         case ZSTD_c_rsyncable :
    @@ -902,7 +904,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     #else
             FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
             CCtxParams->rsyncable = value;
    -        return CCtxParams->rsyncable;
    +        return (size_t)CCtxParams->rsyncable;
     #endif
     
         case ZSTD_c_enableDedicatedDictSearch :
    @@ -939,8 +941,10 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
             return CCtxParams->ldmParams.hashRateLog;
     
         case ZSTD_c_targetCBlockSize :
    -        if (value!=0)   /* 0 ==> default */
    +        if (value!=0) {  /* 0 ==> default */
    +            value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN);
                 BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
    +        }
             CCtxParams->targetCBlockSize = (U32)value;
             return CCtxParams->targetCBlockSize;
     
    @@ -968,7 +972,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         case ZSTD_c_validateSequences:
             BOUNDCHECK(ZSTD_c_validateSequences, value);
             CCtxParams->validateSequences = value;
    -        return CCtxParams->validateSequences;
    +        return (size_t)CCtxParams->validateSequences;
     
         case ZSTD_c_useBlockSplitter:
             BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
    @@ -983,7 +987,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         case ZSTD_c_deterministicRefPrefix:
             BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
             CCtxParams->deterministicRefPrefix = !!value;
    -        return CCtxParams->deterministicRefPrefix;
    +        return (size_t)CCtxParams->deterministicRefPrefix;
     
         case ZSTD_c_prefetchCDictTables:
             BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
    @@ -993,7 +997,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         case ZSTD_c_enableSeqProducerFallback:
             BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
             CCtxParams->enableMatchFinderFallback = value;
    -        return CCtxParams->enableMatchFinderFallback;
    +        return (size_t)CCtxParams->enableMatchFinderFallback;
     
         case ZSTD_c_maxBlockSize:
             if (value!=0)    /* 0 ==> default */
    @@ -1363,7 +1367,6 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
             RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
                             "Reset parameters is only possible during init stage.");
             ZSTD_clearAllDicts(cctx);
    -        ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
             return ZSTD_CCtxParams_reset(&cctx->requestedParams);
         }
         return 0;
    @@ -1391,11 +1394,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
     static ZSTD_compressionParameters
     ZSTD_clampCParams(ZSTD_compressionParameters cParams)
     {
    -#   define CLAMP_TYPE(cParam, val, type) {                                \
    -        ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
    -        if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \
    -    }
    +#   define CLAMP_TYPE(cParam, val, type)                                      \
    +        do {                                                                  \
    +            ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
    +            if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \
    +        } while (0)
     #   define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
         CLAMP(ZSTD_c_windowLog, cParams.windowLog);
         CLAMP(ZSTD_c_chainLog,  cParams.chainLog);
    @@ -1467,6 +1471,48 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
         const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
         assert(ZSTD_checkCParams(cPar)==0);
     
    +    /* Cascade the selected strategy down to the next-highest one built into
    +     * this binary. */
    +#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
    +    if (cPar.strategy == ZSTD_btultra2) {
    +        cPar.strategy = ZSTD_btultra;
    +    }
    +    if (cPar.strategy == ZSTD_btultra) {
    +        cPar.strategy = ZSTD_btopt;
    +    }
    +#endif
    +#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
    +    if (cPar.strategy == ZSTD_btopt) {
    +        cPar.strategy = ZSTD_btlazy2;
    +    }
    +#endif
    +#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
    +    if (cPar.strategy == ZSTD_btlazy2) {
    +        cPar.strategy = ZSTD_lazy2;
    +    }
    +#endif
    +#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
    +    if (cPar.strategy == ZSTD_lazy2) {
    +        cPar.strategy = ZSTD_lazy;
    +    }
    +#endif
    +#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
    +    if (cPar.strategy == ZSTD_lazy) {
    +        cPar.strategy = ZSTD_greedy;
    +    }
    +#endif
    +#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
    +    if (cPar.strategy == ZSTD_greedy) {
    +        cPar.strategy = ZSTD_dfast;
    +    }
    +#endif
    +#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
    +    if (cPar.strategy == ZSTD_dfast) {
    +        cPar.strategy = ZSTD_fast;
    +        cPar.targetLength = 0;
    +    }
    +#endif
    +
         switch (mode) {
         case ZSTD_cpm_unknown:
         case ZSTD_cpm_noAttachDict:
    @@ -1617,8 +1663,8 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
           + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
           + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
           + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder)
                                                 ? ZSTD_cwksp_aligned_alloc_size(hSize)
                                                 : 0;
    @@ -1707,7 +1753,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
          * be needed. However, we still allocate two 0-sized buffers, which can
          * take space under ASAN. */
         return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
    -        &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
    +        &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
     }
     
     size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
    @@ -1768,7 +1814,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
     
             return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
                 &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
    -            ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
    +            ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
         }
     }
     
    @@ -2001,8 +2047,8 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
             ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
             ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
             ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
    -        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
    -        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
    +        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t));
    +        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
         }
     
         ms->cParams = *cParams;
    @@ -2074,7 +2120,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
     
         {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
             size_t const blockSize = MIN(params->maxBlockSize, windowSize);
    -        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer);
    +        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params));
             size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
                     ? ZSTD_compressBound(blockSize) + 1
                     : 0;
    @@ -2091,8 +2137,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
             size_t const neededSpace =
                 ZSTD_estimateCCtxSize_usingCCtxParams_internal(
                     ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
    -                buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize);
    -        int resizeWorkspace;
    +                buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
     
             FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
     
    @@ -2101,7 +2146,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
             {   /* Check if workspace is large enough, alloc a new one if needed */
                 int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
                 int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
    -            resizeWorkspace = workspaceTooSmall || workspaceWasteful;
    +            int resizeWorkspace = workspaceTooSmall || workspaceWasteful;
                 DEBUGLOG(4, "Need %zu B workspace", neededSpace);
                 DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
     
    @@ -2176,10 +2221,10 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
             }
     
             /* reserve space for block-level external sequences */
    -        if (params->useSequenceProducer) {
    +        if (ZSTD_hasExtSeqProd(params)) {
                 size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
    -            zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
    -            zc->externalMatchCtx.seqBuffer =
    +            zc->extSeqBufCapacity = maxNbExternalSeq;
    +            zc->extSeqBuf =
                     (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
             }
     
    @@ -2564,7 +2609,7 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
         assert(size < (1U<<31));   /* can be casted to int */
     
     #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
    -    /* To validate that the table re-use logic is sound, and that we don't
    +    /* To validate that the table reuse logic is sound, and that we don't
          * access table space that we haven't cleaned, we re-"poison" the table
          * space every time we mark it dirty.
          *
    @@ -2992,40 +3037,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS
         static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = {
             { ZSTD_compressBlock_fast  /* default for 0 */,
               ZSTD_compressBlock_fast,
    -          ZSTD_compressBlock_doubleFast,
    -          ZSTD_compressBlock_greedy,
    -          ZSTD_compressBlock_lazy,
    -          ZSTD_compressBlock_lazy2,
    -          ZSTD_compressBlock_btlazy2,
    -          ZSTD_compressBlock_btopt,
    -          ZSTD_compressBlock_btultra,
    -          ZSTD_compressBlock_btultra2 },
    +          ZSTD_COMPRESSBLOCK_DOUBLEFAST,
    +          ZSTD_COMPRESSBLOCK_GREEDY,
    +          ZSTD_COMPRESSBLOCK_LAZY,
    +          ZSTD_COMPRESSBLOCK_LAZY2,
    +          ZSTD_COMPRESSBLOCK_BTLAZY2,
    +          ZSTD_COMPRESSBLOCK_BTOPT,
    +          ZSTD_COMPRESSBLOCK_BTULTRA,
    +          ZSTD_COMPRESSBLOCK_BTULTRA2
    +        },
             { ZSTD_compressBlock_fast_extDict  /* default for 0 */,
               ZSTD_compressBlock_fast_extDict,
    -          ZSTD_compressBlock_doubleFast_extDict,
    -          ZSTD_compressBlock_greedy_extDict,
    -          ZSTD_compressBlock_lazy_extDict,
    -          ZSTD_compressBlock_lazy2_extDict,
    -          ZSTD_compressBlock_btlazy2_extDict,
    -          ZSTD_compressBlock_btopt_extDict,
    -          ZSTD_compressBlock_btultra_extDict,
    -          ZSTD_compressBlock_btultra_extDict },
    +          ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT,
    +          ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT,
    +          ZSTD_COMPRESSBLOCK_LAZY_EXTDICT,
    +          ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT,
    +          ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT,
    +          ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT,
    +          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT,
    +          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT
    +        },
             { ZSTD_compressBlock_fast_dictMatchState  /* default for 0 */,
               ZSTD_compressBlock_fast_dictMatchState,
    -          ZSTD_compressBlock_doubleFast_dictMatchState,
    -          ZSTD_compressBlock_greedy_dictMatchState,
    -          ZSTD_compressBlock_lazy_dictMatchState,
    -          ZSTD_compressBlock_lazy2_dictMatchState,
    -          ZSTD_compressBlock_btlazy2_dictMatchState,
    -          ZSTD_compressBlock_btopt_dictMatchState,
    -          ZSTD_compressBlock_btultra_dictMatchState,
    -          ZSTD_compressBlock_btultra_dictMatchState },
    +          ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE,
    +          ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE,
    +          ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE,
    +          ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE,
    +          ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE,
    +          ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE,
    +          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE,
    +          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE
    +        },
             { NULL  /* default for 0 */,
               NULL,
               NULL,
    -          ZSTD_compressBlock_greedy_dedicatedDictSearch,
    -          ZSTD_compressBlock_lazy_dedicatedDictSearch,
    -          ZSTD_compressBlock_lazy2_dedicatedDictSearch,
    +          ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH,
    +          ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH,
    +          ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH,
               NULL,
               NULL,
               NULL,
    @@ -3038,18 +3086,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS
         DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder);
         if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) {
             static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = {
    -            { ZSTD_compressBlock_greedy_row,
    -            ZSTD_compressBlock_lazy_row,
    -            ZSTD_compressBlock_lazy2_row },
    -            { ZSTD_compressBlock_greedy_extDict_row,
    -            ZSTD_compressBlock_lazy_extDict_row,
    -            ZSTD_compressBlock_lazy2_extDict_row },
    -            { ZSTD_compressBlock_greedy_dictMatchState_row,
    -            ZSTD_compressBlock_lazy_dictMatchState_row,
    -            ZSTD_compressBlock_lazy2_dictMatchState_row },
    -            { ZSTD_compressBlock_greedy_dedicatedDictSearch_row,
    -            ZSTD_compressBlock_lazy_dedicatedDictSearch_row,
    -            ZSTD_compressBlock_lazy2_dedicatedDictSearch_row }
    +            {
    +                ZSTD_COMPRESSBLOCK_GREEDY_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY2_ROW
    +            },
    +            {
    +                ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW
    +            },
    +            {
    +                ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW
    +            },
    +            {
    +                ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW,
    +                ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW
    +            }
             };
             DEBUGLOG(4, "Selecting a row-based matchfinder");
             assert(useRowMatchFinder != ZSTD_ps_auto);
    @@ -3192,7 +3248,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                 /* External matchfinder + LDM is technically possible, just not implemented yet.
                  * We need to revisit soon and implement it. */
                 RETURN_ERROR_IF(
    -                zc->appliedParams.useSequenceProducer,
    +                ZSTD_hasExtSeqProd(&zc->appliedParams),
                     parameter_combination_unsupported,
                     "Long-distance matching with external sequence producer enabled is not currently supported."
                 );
    @@ -3211,7 +3267,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                 /* External matchfinder + LDM is technically possible, just not implemented yet.
                  * We need to revisit soon and implement it. */
                 RETURN_ERROR_IF(
    -                zc->appliedParams.useSequenceProducer,
    +                ZSTD_hasExtSeqProd(&zc->appliedParams),
                     parameter_combination_unsupported,
                     "Long-distance matching with external sequence producer enabled is not currently supported."
                 );
    @@ -3230,18 +3286,18 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                                            zc->appliedParams.useRowMatchFinder,
                                            src, srcSize);
                 assert(ldmSeqStore.pos == ldmSeqStore.size);
    -        } else if (zc->appliedParams.useSequenceProducer) {
    +        } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) {
                 assert(
    -                zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize)
    +                zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize)
                 );
    -            assert(zc->externalMatchCtx.mFinder != NULL);
    +            assert(zc->appliedParams.extSeqProdFunc != NULL);
     
                 {   U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
     
    -                size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
    -                    zc->externalMatchCtx.mState,
    -                    zc->externalMatchCtx.seqBuffer,
    -                    zc->externalMatchCtx.seqBufferCapacity,
    +                size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)(
    +                    zc->appliedParams.extSeqProdState,
    +                    zc->extSeqBuf,
    +                    zc->extSeqBufCapacity,
                         src, srcSize,
                         NULL, 0,  /* dict and dictSize, currently not supported */
                         zc->appliedParams.compressionLevel,
    @@ -3249,21 +3305,21 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                     );
     
                     size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
    -                    zc->externalMatchCtx.seqBuffer,
    +                    zc->extSeqBuf,
                         nbExternalSeqs,
    -                    zc->externalMatchCtx.seqBufferCapacity,
    +                    zc->extSeqBufCapacity,
                         srcSize
                     );
     
                     /* Return early if there is no error, since we don't need to worry about last literals */
                     if (!ZSTD_isError(nbPostProcessedSeqs)) {
                         ZSTD_sequencePosition seqPos = {0,0,0};
    -                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs);
    +                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs);
                         RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
                         FORWARD_IF_ERROR(
                             ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
                                 zc, &seqPos,
    -                            zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
    +                            zc->extSeqBuf, nbPostProcessedSeqs,
                                 src, srcSize,
                                 zc->appliedParams.searchForExternalRepcodes
                             ),
    @@ -3280,9 +3336,11 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                     }
     
                     /* Fallback to software matchfinder */
    -                {   ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
    -                                                                                            zc->appliedParams.useRowMatchFinder,
    -                                                                                            dictMode);
    +                {   ZSTD_blockCompressor const blockCompressor =
    +                        ZSTD_selectBlockCompressor(
    +                            zc->appliedParams.cParams.strategy,
    +                            zc->appliedParams.useRowMatchFinder,
    +                            dictMode);
                         ms->ldmSeqStore = NULL;
                         DEBUGLOG(
                             5,
    @@ -3292,9 +3350,10 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                         lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
                 }   }
             } else {   /* not long range mode and no external matchfinder */
    -            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
    -                                                                                    zc->appliedParams.useRowMatchFinder,
    -                                                                                    dictMode);
    +            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(
    +                    zc->appliedParams.cParams.strategy,
    +                    zc->appliedParams.useRowMatchFinder,
    +                    dictMode);
                 ms->ldmSeqStore = NULL;
                 lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
             }
    @@ -3304,29 +3363,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         return ZSTDbss_compress;
     }
     
    -static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
    +static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM])
     {
    -    const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
    -    const seqDef* seqStoreSeqs = seqStore->sequencesStart;
    -    size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
    -    size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
    -    size_t literalsRead = 0;
    -    size_t lastLLSize;
    +    const seqDef* inSeqs = seqStore->sequencesStart;
    +    const size_t nbInSequences = seqStore->sequences - inSeqs;
    +    const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart);
     
    -    ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
    +    ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex;
    +    const size_t nbOutSequences = nbInSequences + 1;
    +    size_t nbOutLiterals = 0;
    +    repcodes_t repcodes;
         size_t i;
    -    repcodes_t updatedRepcodes;
     
    -    assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
    -    /* Ensure we have enough space for last literals "sequence" */
    -    assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
    -    ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
    -    for (i = 0; i < seqStoreSeqSize; ++i) {
    -        U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
    -        outSeqs[i].litLength = seqStoreSeqs[i].litLength;
    -        outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
    +    /* Bounds check that we have enough space for every input sequence
    +     * and the block delimiter
    +     */
    +    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
    +    RETURN_ERROR_IF(
    +        nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex),
    +        dstSize_tooSmall,
    +        "Not enough space to copy sequences");
    +
    +    ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes));
    +    for (i = 0; i < nbInSequences; ++i) {
    +        U32 rawOffset;
    +        outSeqs[i].litLength = inSeqs[i].litLength;
    +        outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH;
             outSeqs[i].rep = 0;
     
    +        /* Handle the possible single length >= 64K
    +         * There can only be one because we add MINMATCH to every match length,
    +         * and blocks are at most 128K.
    +         */
             if (i == seqStore->longLengthPos) {
                 if (seqStore->longLengthType == ZSTD_llt_literalLength) {
                     outSeqs[i].litLength += 0x10000;
    @@ -3335,41 +3403,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
                 }
             }
     
    -        if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
    -            /* Derive the correct offset corresponding to a repcode */
    -            outSeqs[i].rep = seqStoreSeqs[i].offBase;
    +        /* Determine the raw offset given the offBase, which may be a repcode. */
    +        if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) {
    +            const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase);
    +            assert(repcode > 0);
    +            outSeqs[i].rep = repcode;
                 if (outSeqs[i].litLength != 0) {
    -                rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
    +                rawOffset = repcodes.rep[repcode - 1];
                 } else {
    -                if (outSeqs[i].rep == 3) {
    -                    rawOffset = updatedRepcodes.rep[0] - 1;
    +                if (repcode == 3) {
    +                    assert(repcodes.rep[0] > 1);
    +                    rawOffset = repcodes.rep[0] - 1;
                     } else {
    -                    rawOffset = updatedRepcodes.rep[outSeqs[i].rep];
    +                    rawOffset = repcodes.rep[repcode];
                     }
                 }
    +        } else {
    +            rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase);
             }
             outSeqs[i].offset = rawOffset;
    -        /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
    -           so we provide seqStoreSeqs[i].offset - 1 */
    -        ZSTD_updateRep(updatedRepcodes.rep,
    -                       seqStoreSeqs[i].offBase,
    -                       seqStoreSeqs[i].litLength == 0);
    -        literalsRead += outSeqs[i].litLength;
    +
    +        /* Update repcode history for the sequence */
    +        ZSTD_updateRep(repcodes.rep,
    +                       inSeqs[i].offBase,
    +                       inSeqs[i].litLength == 0);
    +
    +        nbOutLiterals += outSeqs[i].litLength;
         }
         /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
          * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
          * for the block boundary, according to the API.
          */
    -    assert(seqStoreLiteralsSize >= literalsRead);
    -    lastLLSize = seqStoreLiteralsSize - literalsRead;
    -    outSeqs[i].litLength = (U32)lastLLSize;
    -    outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
    -    seqStoreSeqSize++;
    -    zc->seqCollector.seqIndex += seqStoreSeqSize;
    +    assert(nbInLiterals >= nbOutLiterals);
    +    {
    +        const size_t lastLLSize = nbInLiterals - nbOutLiterals;
    +        outSeqs[nbInSequences].litLength = (U32)lastLLSize;
    +        outSeqs[nbInSequences].matchLength = 0;
    +        outSeqs[nbInSequences].offset = 0;
    +        assert(nbOutSequences == nbInSequences + 1);
    +    }
    +    seqCollector->seqIndex += nbOutSequences;
    +    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
    +
    +    return 0;
     }
     
     size_t ZSTD_sequenceBound(size_t srcSize) {
    -    return (srcSize / ZSTD_MINMATCH_MIN) + 1;
    +    const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1;
    +    const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1;
    +    return maxNbSeq + maxNbDelims;
     }
     
     size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
    @@ -3378,6 +3460,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
         const size_t dstCapacity = ZSTD_compressBound(srcSize);
         void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
         SeqCollector seqCollector;
    +    {
    +        int targetCBlockSize;
    +        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), "");
    +        RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0");
    +    }
    +    {
    +        int nbWorkers;
    +        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), "");
    +        RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0");
    +    }
     
         RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
     
    @@ -3387,8 +3479,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
         seqCollector.maxSequences = outSeqsSize;
         zc->seqCollector = seqCollector;
     
    -    ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
    -    ZSTD_customFree(dst, ZSTD_defaultCMem);
    +    {
    +        const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
    +        ZSTD_customFree(dst, ZSTD_defaultCMem);
    +        FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed");
    +    }
    +    assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize));
         return zc->seqCollector.seqIndex;
     }
     
    @@ -3981,8 +4077,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
             cSeqsSize = 1;
         }
     
    +    /* Sequence collection not supported when block splitting */
         if (zc->seqCollector.collectSequences) {
    -        ZSTD_copyBlockSequences(zc);
    +        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed");
             ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
             return 0;
         }
    @@ -4204,6 +4301,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
             if (bss == ZSTDbss_noCompress) {
                 if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
                     zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
    +            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
                 cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
                 FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
                 DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
    @@ -4236,11 +4334,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
     
         {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
             FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
    -        if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
    +        if (bss == ZSTDbss_noCompress) {
    +            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
    +            cSize = 0;
    +            goto out;
    +        }
         }
     
         if (zc->seqCollector.collectSequences) {
    -        ZSTD_copyBlockSequences(zc);
    +        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed");
             ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
             return 0;
         }
    @@ -4553,19 +4655,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
         }
     }
     
    -size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
    +void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
     {
    -    RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
    -                    "wrong cctx stage");
    -    RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable,
    -                    parameter_unsupported,
    -                    "incompatible with ldm");
    +    assert(cctx->stage == ZSTDcs_init);
    +    assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable);
         cctx->externSeqStore.seq = seq;
         cctx->externSeqStore.size = nbSeq;
         cctx->externSeqStore.capacity = nbSeq;
         cctx->externSeqStore.pos = 0;
         cctx->externSeqStore.posInSequence = 0;
    -    return 0;
     }
     
     
    @@ -4760,12 +4858,19 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
             ZSTD_fillHashTable(ms, iend, dtlm, tfp);
             break;
         case ZSTD_dfast:
    +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
             ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
    +#else
    +        assert(0); /* shouldn't be called: cparams should've been adjusted. */
    +#endif
             break;
     
         case ZSTD_greedy:
         case ZSTD_lazy:
         case ZSTD_lazy2:
    +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR)
             assert(srcSize >= HASH_READ_SIZE);
             if (ms->dedicatedDictSearch) {
                 assert(ms->chainTable != NULL);
    @@ -4782,14 +4887,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                     DEBUGLOG(4, "Using chain-based hash table for lazy dict");
                 }
             }
    +#else
    +        assert(0); /* shouldn't be called: cparams should've been adjusted. */
    +#endif
             break;
     
         case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
         case ZSTD_btopt:
         case ZSTD_btultra:
         case ZSTD_btultra2:
    +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
             assert(srcSize >= HASH_READ_SIZE);
             ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend);
    +#else
    +        assert(0); /* shouldn't be called: cparams should've been adjusted. */
    +#endif
             break;
     
         default:
    @@ -4836,11 +4950,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
     
             /* We only set the loaded table as valid if it contains all non-zero
              * weights. Otherwise, we set it to check */
    -        if (!hasZeroWeights)
    +        if (!hasZeroWeights && maxSymbolValue == 255)
                 bs->entropy.huf.repeatMode = HUF_repeat_valid;
     
             RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, "");
    -        RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, "");
             dictPtr += hufHeaderSize;
         }
     
    @@ -5107,14 +5220,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
     {
         BYTE* const ostart = (BYTE*)dst;
         BYTE* op = ostart;
    -    size_t fhSize = 0;
     
         DEBUGLOG(4, "ZSTD_writeEpilogue");
         RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing");
     
         /* special case : empty frame */
         if (cctx->stage == ZSTDcs_init) {
    -        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
    +        size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
             FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
             dstCapacity -= fhSize;
             op += fhSize;
    @@ -5124,8 +5236,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
         if (cctx->stage != ZSTDcs_ending) {
             /* write one last empty block, make it the "last" block */
             U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
    -        RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue");
    -        MEM_writeLE32(op, cBlockHeader24);
    +        ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3);
    +        RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue");
    +        MEM_writeLE24(op, cBlockHeader24);
             op += ZSTD_blockHeaderSize;
             dstCapacity -= ZSTD_blockHeaderSize;
         }
    @@ -5455,7 +5568,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2(
                             cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch,
                             customMem);
     
    -    if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
    +    if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                         dict, dictSize,
                                         dictLoadMethod, dictContentType,
                                         cctxParams) )) {
    @@ -5879,7 +5992,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
         if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
             assert(input->pos >= zcs->stableIn_notConsumed);
             input->pos -= zcs->stableIn_notConsumed;
    -        ip -= zcs->stableIn_notConsumed;
    +        if (ip) ip -= zcs->stableIn_notConsumed;
             zcs->stableIn_notConsumed = 0;
         }
         if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
    @@ -6138,7 +6251,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     #ifdef ZSTD_MULTITHREAD
         /* If external matchfinder is enabled, make sure to fail before checking job size (for consistency) */
         RETURN_ERROR_IF(
    -        params.useSequenceProducer == 1 && params.nbWorkers >= 1,
    +        ZSTD_hasExtSeqProd(¶ms) && params.nbWorkers >= 1,
             parameter_combination_unsupported,
             "External sequence producer isn't supported with nbWorkers >= 1"
         );
    @@ -6430,7 +6543,7 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
             if (cctx->appliedParams.validateSequences) {
                 seqPos->posInSrc += litLength + matchLength;
                 FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
    -                                                cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
    +                                                cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
                                                     "Sequence validation failed");
             }
             RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
    @@ -6568,7 +6681,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
             if (cctx->appliedParams.validateSequences) {
                 seqPos->posInSrc += litLength + matchLength;
                 FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
    -                                                   cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
    +                                                   cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
                                                        "Sequence validation failed");
             }
             DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
    @@ -7014,19 +7127,27 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
     }
     
     void ZSTD_registerSequenceProducer(
    -    ZSTD_CCtx* zc, void* mState,
    -    ZSTD_sequenceProducer_F* mFinder
    +    ZSTD_CCtx* zc,
    +    void* extSeqProdState,
    +    ZSTD_sequenceProducer_F extSeqProdFunc
    +) {
    +    assert(zc != NULL);
    +    ZSTD_CCtxParams_registerSequenceProducer(
    +        &zc->requestedParams, extSeqProdState, extSeqProdFunc
    +    );
    +}
    +
    +void ZSTD_CCtxParams_registerSequenceProducer(
    +  ZSTD_CCtx_params* params,
    +  void* extSeqProdState,
    +  ZSTD_sequenceProducer_F extSeqProdFunc
     ) {
    -    if (mFinder != NULL) {
    -        ZSTD_externalMatchCtx emctx;
    -        emctx.mState = mState;
    -        emctx.mFinder = mFinder;
    -        emctx.seqBuffer = NULL;
    -        emctx.seqBufferCapacity = 0;
    -        zc->externalMatchCtx = emctx;
    -        zc->requestedParams.useSequenceProducer = 1;
    +    assert(params != NULL);
    +    if (extSeqProdFunc != NULL) {
    +        params->extSeqProdFunc = extSeqProdFunc;
    +        params->extSeqProdState = extSeqProdState;
         } else {
    -        ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx));
    -        zc->requestedParams.useSequenceProducer = 0;
    +        params->extSeqProdFunc = NULL;
    +        params->extSeqProdState = NULL;
         }
     }
    diff --git a/third-party/zstd/lib/compress/zstd_compress_internal.h b/third-party/zstd/lib/compress/zstd_compress_internal.h
    index 10f68d01..e41d7b78 100644
    --- a/third-party/zstd/lib/compress/zstd_compress_internal.h
    +++ b/third-party/zstd/lib/compress/zstd_compress_internal.h
    @@ -39,7 +39,7 @@ extern "C" {
                                            It's not a big deal though : candidate will just be sorted again.
                                            Additionally, candidate position 1 will be lost.
                                            But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
    -                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy.
    +                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy.
                                            This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
     
     
    @@ -159,23 +159,24 @@ typedef struct {
     UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
     
     typedef struct {
    -    int price;
    -    U32 off;
    -    U32 mlen;
    -    U32 litlen;
    -    U32 rep[ZSTD_REP_NUM];
    +    int price;  /* price from beginning of segment to this position */
    +    U32 off;    /* offset of previous match */
    +    U32 mlen;   /* length of previous match */
    +    U32 litlen; /* nb of literals since previous match */
    +    U32 rep[ZSTD_REP_NUM];  /* offset history after previous match */
     } ZSTD_optimal_t;
     
     typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;
     
    +#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3)
     typedef struct {
         /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
         unsigned* litFreq;           /* table of literals statistics, of size 256 */
         unsigned* litLengthFreq;     /* table of litLength statistics, of size (MaxLL+1) */
         unsigned* matchLengthFreq;   /* table of matchLength statistics, of size (MaxML+1) */
         unsigned* offCodeFreq;       /* table of offCode statistics, of size (MaxOff+1) */
    -    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_NUM+1 */
    -    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
    +    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_SIZE */
    +    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */
     
         U32  litSum;                 /* nb of literals */
         U32  litLengthSum;           /* nb of litLength codes */
    @@ -228,7 +229,7 @@ struct ZSTD_matchState_t {
         U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
         BYTE* tagTable;                          /* For row-based matchFinder: A row-based table containing the hashes and head index. */
         U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
    -    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for re-use of tag table */
    +    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for reuse of tag table */
         U32 hashSaltEntropy;                     /* For row-based matchFinder: collects entropy for salt generation */
     
         U32* hashTable;
    @@ -360,10 +361,11 @@ struct ZSTD_CCtx_params_s {
          * if the external matchfinder returns an error code. */
         int enableMatchFinderFallback;
     
    -    /* Indicates whether an external matchfinder has been referenced.
    -     * Users can't set this externally.
    -     * It is set internally in ZSTD_registerSequenceProducer(). */
    -    int useSequenceProducer;
    +    /* Parameters for the external sequence producer API.
    +     * Users set these parameters through ZSTD_registerSequenceProducer().
    +     * It is not possible to set these parameters individually through the public API. */
    +    void* extSeqProdState;
    +    ZSTD_sequenceProducer_F extSeqProdFunc;
     
         /* Adjust the max block size*/
         size_t maxBlockSize;
    @@ -401,14 +403,6 @@ typedef struct {
         ZSTD_entropyCTablesMetadata_t entropyMetadata;
     } ZSTD_blockSplitCtx;
     
    -/* Context for block-level external matchfinder API */
    -typedef struct {
    -  void* mState;
    -  ZSTD_sequenceProducer_F* mFinder;
    -  ZSTD_Sequence* seqBuffer;
    -  size_t seqBufferCapacity;
    -} ZSTD_externalMatchCtx;
    -
     struct ZSTD_CCtx_s {
         ZSTD_compressionStage_e stage;
         int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
    @@ -479,8 +473,9 @@ struct ZSTD_CCtx_s {
         /* Workspace for block splitter */
         ZSTD_blockSplitCtx blockSplitCtx;
     
    -    /* Workspace for external matchfinder */
    -    ZSTD_externalMatchCtx externalMatchCtx;
    +    /* Buffer for output from external sequence producer */
    +    ZSTD_Sequence* extSeqBuf;
    +    size_t extSeqBufCapacity;
     };
     
     typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
    @@ -1053,7 +1048,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
      * The least significant cycleLog bits of the indices must remain the same,
      * which may be 0. Every index up to maxDist in the past must be valid.
      */
    -MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
    +MEM_STATIC
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
                                                U32 maxDist, void const* src)
     {
         /* preemptive overflow correction:
    @@ -1246,7 +1243,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
      * forget about the extDict. Handles overlap of the prefix and extDict.
      * Returns non-zero if the segment is contiguous.
      */
    -MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
    +MEM_STATIC
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32 ZSTD_window_update(ZSTD_window_t* window,
                                       void const* src, size_t srcSize,
                                       int forceNonContiguous)
     {
    @@ -1467,11 +1466,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
      * This cannot be used when long range matching is enabled.
      * Zstd will use these sequences, and pass the literals to a secondary block
      * compressor.
    - * @return : An error code on failure.
      * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
      * access and data corruption.
      */
    -size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
    +void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
     
     /** ZSTD_cycleLog() :
      *  condition for correct operation : hashLog > 1 */
    @@ -1509,6 +1507,10 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
                                        const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
     
    +/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */
    +MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) {
    +    return params->extSeqProdFunc != NULL;
    +}
     
     /* ===============================================================
      * Deprecated definitions that are still used internally to avoid
    diff --git a/third-party/zstd/lib/compress/zstd_compress_superblock.c b/third-party/zstd/lib/compress/zstd_compress_superblock.c
    index 638c4acb..628a2dcc 100644
    --- a/third-party/zstd/lib/compress/zstd_compress_superblock.c
    +++ b/third-party/zstd/lib/compress/zstd_compress_superblock.c
    @@ -76,8 +76,8 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
         }
     
         {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
    -        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags)
    -                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags);
    +        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags)
    +                                          : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags);
             op += cSize;
             cLitSize += cSize;
             if (cSize == 0 || ERR_isError(cSize)) {
    @@ -102,7 +102,7 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
         switch(lhSize)
         {
         case 3: /* 2 - 2 - 10 - 10 */
    -        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
    +        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
                 MEM_writeLE24(ostart, lhc);
                 break;
             }
    @@ -122,30 +122,30 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
         }
         *entropyWritten = 1;
         DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
    -    return op-ostart;
    +    return (size_t)(op-ostart);
     }
     
     static size_t
     ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
    -                   const seqDef* sequences, size_t nbSeq,
    -                         size_t litSize, int lastSequence)
    +                   const seqDef* sequences, size_t nbSeqs,
    +                         size_t litSize, int lastSubBlock)
     {
    -    const seqDef* const sstart = sequences;
    -    const seqDef* const send = sequences + nbSeq;
    -    const seqDef* sp = sstart;
         size_t matchLengthSum = 0;
         size_t litLengthSum = 0;
    -    (void)(litLengthSum); /* suppress unused variable warning on some environments */
    -    while (send-sp > 0) {
    -        ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
    +    size_t n;
    +    for (n=0; n>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
         else
             op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
         if (nbSeq==0) {
    -        return op - ostart;
    +        return (size_t)(op - ostart);
         }
     
         /* seqHead : flags for FSE encoding type */
    @@ -209,7 +209,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
         }
     
         {   size_t const bitstreamSize = ZSTD_encodeSequences(
    -                                        op, oend - op,
    +                                        op, (size_t)(oend - op),
                                             fseTables->matchlengthCTable, mlCode,
                                             fseTables->offcodeCTable, ofCode,
                                             fseTables->litlengthCTable, llCode,
    @@ -253,7 +253,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
     #endif
     
         *entropyWritten = 1;
    -    return op - ostart;
    +    return (size_t)(op - ostart);
     }
     
     /** ZSTD_compressSubBlock() :
    @@ -279,7 +279,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
                     litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
         {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
                                                             &entropyMetadata->hufMetadata, literals, litSize,
    -                                                        op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
    +                                                        op, (size_t)(oend-op),
    +                                                        bmi2, writeLitEntropy, litEntropyWritten);
             FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
             if (cLitSize == 0) return 0;
             op += cLitSize;
    @@ -289,18 +290,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
                                                       sequences, nbSeq,
                                                       llCode, mlCode, ofCode,
                                                       cctxParams,
    -                                                  op, oend-op,
    +                                                  op, (size_t)(oend-op),
                                                       bmi2, writeSeqEntropy, seqEntropyWritten);
             FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
             if (cSeqSize == 0) return 0;
             op += cSeqSize;
         }
         /* Write block header */
    -    {   size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
    +    {   size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;
             U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
             MEM_writeLE24(ostart, cBlockHeader24);
         }
    -    return op-ostart;
    +    return (size_t)(op-ostart);
     }
     
     static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
    @@ -389,7 +390,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
         return cSeqSizeEstimate + sequencesSectionHeaderSize;
     }
     
    -static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
    +typedef struct {
    +    size_t estLitSize;
    +    size_t estBlockSize;
    +} EstimatedBlockSize;
    +static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
                                             const BYTE* ofCodeTable,
                                             const BYTE* llCodeTable,
                                             const BYTE* mlCodeTable,
    @@ -397,15 +402,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
                                             const ZSTD_entropyCTables_t* entropy,
                                             const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
                                             void* workspace, size_t wkspSize,
    -                                        int writeLitEntropy, int writeSeqEntropy) {
    -    size_t cSizeEstimate = 0;
    -    cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
    -                                                         &entropy->huf, &entropyMetadata->hufMetadata,
    -                                                         workspace, wkspSize, writeLitEntropy);
    -    cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
    +                                        int writeLitEntropy, int writeSeqEntropy)
    +{
    +    EstimatedBlockSize ebs;
    +    ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize,
    +                                                        &entropy->huf, &entropyMetadata->hufMetadata,
    +                                                        workspace, wkspSize, writeLitEntropy);
    +    ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
                                                              nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
                                                              workspace, wkspSize, writeSeqEntropy);
    -    return cSizeEstimate + ZSTD_blockHeaderSize;
    +    ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize;
    +    return ebs;
     }
     
     static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
    @@ -419,13 +426,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe
         return 0;
     }
     
    +static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount)
    +{
    +    size_t n, total = 0;
    +    assert(sp != NULL);
    +    for (n=0; n %zu bytes", seqCount, (const void*)sp, total);
    +    return total;
    +}
    +
    +#define BYTESCALE 256
    +
    +static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs,
    +                size_t targetBudget, size_t avgLitCost, size_t avgSeqCost,
    +                int firstSubBlock)
    +{
    +    size_t n, budget = 0, inSize=0;
    +    /* entropy headers */
    +    size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */
    +    assert(firstSubBlock==0 || firstSubBlock==1);
    +    budget += headerSize;
    +
    +    /* first sequence => at least one sequence*/
    +    budget += sp[0].litLength * avgLitCost + avgSeqCost;
    +    if (budget > targetBudget) return 1;
    +    inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH);
    +
    +    /* loop over sequences */
    +    for (n=1; n targetBudget)
    +            /* though continue to expand until the sub-block is deemed compressible */
    +          && (budget < inSize * BYTESCALE) )
    +            break;
    +    }
    +
    +    return n;
    +}
    +
     /** ZSTD_compressSubBlock_multi() :
      *  Breaks super-block into multiple sub-blocks and compresses them.
    - *  Entropy will be written to the first block.
    - *  The following blocks will use repeat mode to compress.
    - *  All sub-blocks are compressed blocks (no raw or rle blocks).
    - *  @return : compressed size of the super block (which is multiple ZSTD blocks)
    - *            Or 0 if it failed to compress. */
    + *  Entropy will be written into the first block.
    + *  The following blocks use repeat_mode to compress.
    + *  Sub-blocks are all compressed, except the last one when beneficial.
    + *  @return : compressed size of the super block (which features multiple ZSTD blocks)
    + *            or 0 if it failed to compress. */
     static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
                                 const ZSTD_compressedBlockState_t* prevCBlock,
                                 ZSTD_compressedBlockState_t* nextCBlock,
    @@ -438,10 +488,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
     {
         const seqDef* const sstart = seqStorePtr->sequencesStart;
         const seqDef* const send = seqStorePtr->sequences;
    -    const seqDef* sp = sstart;
    +    const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */
    +    size_t const nbSeqs = (size_t)(send - sstart);
         const BYTE* const lstart = seqStorePtr->litStart;
         const BYTE* const lend = seqStorePtr->lit;
         const BYTE* lp = lstart;
    +    size_t const nbLiterals = (size_t)(lend - lstart);
         BYTE const* ip = (BYTE const*)src;
         BYTE const* const iend = ip + srcSize;
         BYTE* const ostart = (BYTE*)dst;
    @@ -450,96 +502,152 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
         const BYTE* llCodePtr = seqStorePtr->llCode;
         const BYTE* mlCodePtr = seqStorePtr->mlCode;
         const BYTE* ofCodePtr = seqStorePtr->ofCode;
    -    size_t targetCBlockSize = cctxParams->targetCBlockSize;
    -    size_t litSize, seqCount;
    -    int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
    +    size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */
    +    size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize);
    +    int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed);
         int writeSeqEntropy = 1;
    -    int lastSequence = 0;
    -
    -    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
    -                (unsigned)(lend-lp), (unsigned)(send-sstart));
    -
    -    litSize = 0;
    -    seqCount = 0;
    -    do {
    -        size_t cBlockSizeEstimate = 0;
    -        if (sstart == send) {
    -            lastSequence = 1;
    -        } else {
    -            const seqDef* const sequence = sp + seqCount;
    -            lastSequence = sequence == send - 1;
    -            litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
    -            seqCount++;
    -        }
    -        if (lastSequence) {
    -            assert(lp <= lend);
    -            assert(litSize <= (size_t)(lend - lp));
    -            litSize = (size_t)(lend - lp);
    +
    +    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)",
    +               (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart));
    +
    +        /* let's start by a general estimation for the full block */
    +    if (nbSeqs > 0) {
    +        EstimatedBlockSize const ebs =
    +                ZSTD_estimateSubBlockSize(lp, nbLiterals,
    +                                        ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
    +                                        &nextCBlock->entropy, entropyMetadata,
    +                                        workspace, wkspSize,
    +                                        writeLitEntropy, writeSeqEntropy);
    +        /* quick estimation */
    +        size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE;
    +        size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs;
    +        const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1);
    +        size_t n, avgBlockBudget, blockBudgetSupp=0;
    +        avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks;
    +        DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes",
    +                    (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE,
    +                    (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE);
    +        /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately
    +         * this will result in the production of a single uncompressed block covering @srcSize.*/
    +        if (ebs.estBlockSize > srcSize) return 0;
    +
    +        /* compress and write sub-blocks */
    +        assert(nbSubBlocks>0);
    +        for (n=0; n < nbSubBlocks-1; n++) {
    +            /* determine nb of sequences for current sub-block + nbLiterals from next sequence */
    +            size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp),
    +                                        avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0);
    +            /* if reached last sequence : break to last sub-block (simplification) */
    +            assert(seqCount <= (size_t)(send-sp));
    +            if (sp + seqCount == send) break;
    +            assert(seqCount > 0);
    +            /* compress sub-block */
    +            {   int litEntropyWritten = 0;
    +                int seqEntropyWritten = 0;
    +                size_t litSize = countLiterals(seqStorePtr, sp, seqCount);
    +                const size_t decompressedSize =
    +                        ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0);
    +                size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
    +                                                sp, seqCount,
    +                                                lp, litSize,
    +                                                llCodePtr, mlCodePtr, ofCodePtr,
    +                                                cctxParams,
    +                                                op, (size_t)(oend-op),
    +                                                bmi2, writeLitEntropy, writeSeqEntropy,
    +                                                &litEntropyWritten, &seqEntropyWritten,
    +                                                0);
    +                FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
    +
    +                /* check compressibility, update state components */
    +                if (cSize > 0 && cSize < decompressedSize) {
    +                    DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes",
    +                                (unsigned)decompressedSize, (unsigned)cSize);
    +                    assert(ip + decompressedSize <= iend);
    +                    ip += decompressedSize;
    +                    lp += litSize;
    +                    op += cSize;
    +                    llCodePtr += seqCount;
    +                    mlCodePtr += seqCount;
    +                    ofCodePtr += seqCount;
    +                    /* Entropy only needs to be written once */
    +                    if (litEntropyWritten) {
    +                        writeLitEntropy = 0;
    +                    }
    +                    if (seqEntropyWritten) {
    +                        writeSeqEntropy = 0;
    +                    }
    +                    sp += seqCount;
    +                    blockBudgetSupp = 0;
    +            }   }
    +            /* otherwise : do not compress yet, coalesce current sub-block with following one */
             }
    -        /* I think there is an optimization opportunity here.
    -         * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
    -         * since it recalculates estimate from scratch.
    -         * For example, it would recount literal distribution and symbol codes every time.
    -         */
    -        cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
    -                                                       &nextCBlock->entropy, entropyMetadata,
    -                                                       workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
    -        if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
    -            int litEntropyWritten = 0;
    -            int seqEntropyWritten = 0;
    -            const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
    -            const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
    -                                                       sp, seqCount,
    -                                                       lp, litSize,
    -                                                       llCodePtr, mlCodePtr, ofCodePtr,
    -                                                       cctxParams,
    -                                                       op, oend-op,
    -                                                       bmi2, writeLitEntropy, writeSeqEntropy,
    -                                                       &litEntropyWritten, &seqEntropyWritten,
    -                                                       lastBlock && lastSequence);
    -            FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
    -            if (cSize > 0 && cSize < decompressedSize) {
    -                DEBUGLOG(5, "Committed the sub-block");
    -                assert(ip + decompressedSize <= iend);
    -                ip += decompressedSize;
    -                sp += seqCount;
    -                lp += litSize;
    -                op += cSize;
    -                llCodePtr += seqCount;
    -                mlCodePtr += seqCount;
    -                ofCodePtr += seqCount;
    -                litSize = 0;
    -                seqCount = 0;
    -                /* Entropy only needs to be written once */
    -                if (litEntropyWritten) {
    -                    writeLitEntropy = 0;
    -                }
    -                if (seqEntropyWritten) {
    -                    writeSeqEntropy = 0;
    -                }
    +    } /* if (nbSeqs > 0) */
    +
    +    /* write last block */
    +    DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp));
    +    {   int litEntropyWritten = 0;
    +        int seqEntropyWritten = 0;
    +        size_t litSize = (size_t)(lend - lp);
    +        size_t seqCount = (size_t)(send - sp);
    +        const size_t decompressedSize =
    +                ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1);
    +        size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
    +                                            sp, seqCount,
    +                                            lp, litSize,
    +                                            llCodePtr, mlCodePtr, ofCodePtr,
    +                                            cctxParams,
    +                                            op, (size_t)(oend-op),
    +                                            bmi2, writeLitEntropy, writeSeqEntropy,
    +                                            &litEntropyWritten, &seqEntropyWritten,
    +                                            lastBlock);
    +        FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
    +
    +        /* update pointers, the nb of literals borrowed from next sequence must be preserved */
    +        if (cSize > 0 && cSize < decompressedSize) {
    +            DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes",
    +                        (unsigned)decompressedSize, (unsigned)cSize);
    +            assert(ip + decompressedSize <= iend);
    +            ip += decompressedSize;
    +            lp += litSize;
    +            op += cSize;
    +            llCodePtr += seqCount;
    +            mlCodePtr += seqCount;
    +            ofCodePtr += seqCount;
    +            /* Entropy only needs to be written once */
    +            if (litEntropyWritten) {
    +                writeLitEntropy = 0;
    +            }
    +            if (seqEntropyWritten) {
    +                writeSeqEntropy = 0;
                 }
    +            sp += seqCount;
             }
    -    } while (!lastSequence);
    +    }
    +
    +
         if (writeLitEntropy) {
    -        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
    +        DEBUGLOG(5, "Literal entropy tables were never written");
             ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
         }
         if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
             /* If we haven't written our entropy tables, then we've violated our contract and
              * must emit an uncompressed block.
              */
    -        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
    +        DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block");
             return 0;
         }
    +
         if (ip < iend) {
    -        size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
    -        DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
    +        /* some data left : last part of the block sent uncompressed */
    +        size_t const rSize = (size_t)((iend - ip));
    +        size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock);
    +        DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize));
             FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
             assert(cSize != 0);
             op += cSize;
             /* We have to regenerate the repcodes because we've skipped some sequences */
             if (sp < send) {
    -            seqDef const* seq;
    +            const seqDef* seq;
                 repcodes_t rep;
                 ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
                 for (seq = sstart; seq < sp; ++seq) {
    @@ -548,14 +656,17 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
                 ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
             }
         }
    -    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
    -    return op-ostart;
    +
    +    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u",
    +                (unsigned)(op-ostart));
    +    return (size_t)(op-ostart);
     }
     
     size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
                                    void* dst, size_t dstCapacity,
    -                               void const* src, size_t srcSize,
    -                               unsigned lastBlock) {
    +                               const void* src, size_t srcSize,
    +                               unsigned lastBlock)
    +{
         ZSTD_entropyCTablesMetadata_t entropyMetadata;
     
         FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore,
    diff --git a/third-party/zstd/lib/compress/zstd_cwksp.h b/third-party/zstd/lib/compress/zstd_cwksp.h
    index cc7fb1c7..3eddbd33 100644
    --- a/third-party/zstd/lib/compress/zstd_cwksp.h
    +++ b/third-party/zstd/lib/compress/zstd_cwksp.h
    @@ -192,6 +192,7 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
         {
             intptr_t const offset = __msan_test_shadow(ws->initOnceStart,
                 (U8*)ZSTD_cwksp_initialAllocStart(ws) - (U8*)ws->initOnceStart);
    +        (void)offset;
     #if defined(ZSTD_MSAN_PRINT)
             if(offset!=-1) {
                 __msan_print_shadow((U8*)ws->initOnceStart + offset - 8, 32);
    @@ -433,7 +434,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
     
     /**
      * Aligned on 64 bytes. These buffers have the special property that
    - * their values remain constrained, allowing us to re-use them without
    + * their values remain constrained, allowing us to reuse them without
      * memset()-ing them.
      */
     MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
    @@ -525,7 +526,7 @@ MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws)
         DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
     
     #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
    -    /* To validate that the table re-use logic is sound, and that we don't
    +    /* To validate that the table reuse logic is sound, and that we don't
          * access table space that we haven't cleaned, we re-"poison" the table
          * space every time we mark it dirty.
          * Since tableValidEnd space and initOnce space may overlap we don't poison
    @@ -602,9 +603,9 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
         DEBUGLOG(4, "cwksp: clearing!");
     
     #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
    -    /* To validate that the context re-use logic is sound, and that we don't
    +    /* To validate that the context reuse logic is sound, and that we don't
          * access stuff that this compression hasn't initialized, we re-"poison"
    -     * the workspace except for the areas in which we expect memory re-use
    +     * the workspace except for the areas in which we expect memory reuse
          * without initialization (objects, valid tables area and init once
          * memory). */
         {
    @@ -635,6 +636,15 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
         ZSTD_cwksp_assert_internal_consistency(ws);
     }
     
    +MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
    +    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
    +}
    +
    +MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
    +    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
    +         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
    +}
    +
     /**
      * The provided workspace takes ownership of the buffer [start, start+size).
      * Any existing values in the workspace are ignored (the previously managed
    @@ -666,6 +676,11 @@ MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem
     MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) {
         void *ptr = ws->workspace;
         DEBUGLOG(4, "cwksp: freeing workspace");
    +#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE)
    +    if (ptr != NULL && customMem.customFree != NULL) {
    +        __msan_unpoison(ptr, ZSTD_cwksp_sizeof(ws));
    +    }
    +#endif
         ZSTD_memset(ws, 0, sizeof(ZSTD_cwksp));
         ZSTD_customFree(ptr, customMem);
     }
    @@ -679,15 +694,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
         ZSTD_memset(src, 0, sizeof(ZSTD_cwksp));
     }
     
    -MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
    -    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
    -}
    -
    -MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
    -    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
    -         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
    -}
    -
     MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
         return ws->allocFailed;
     }
    diff --git a/third-party/zstd/lib/compress/zstd_double_fast.c b/third-party/zstd/lib/compress/zstd_double_fast.c
    index 0ad88ffc..a4e9c50d 100644
    --- a/third-party/zstd/lib/compress/zstd_double_fast.c
    +++ b/third-party/zstd/lib/compress/zstd_double_fast.c
    @@ -11,7 +11,11 @@
     #include "zstd_compress_internal.h"
     #include "zstd_double_fast.h"
     
    -static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
    +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
    +
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
                                   void const* end, ZSTD_dictTableLoadMethod_e dtlm)
     {
         const ZSTD_compressionParameters* const cParams = &ms->cParams;
    @@ -47,7 +51,9 @@ static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
         }   }
     }
     
    -static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
                                   void const* end, ZSTD_dictTableLoadMethod_e dtlm)
     {
         const ZSTD_compressionParameters* const cParams = &ms->cParams;
    @@ -95,6 +101,7 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
     
     
     FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize, U32 const mls /* template */)
    @@ -305,6 +312,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
     
     
     FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize,
    @@ -348,8 +356,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         if (ms->prefetchCDictTables) {
             size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
             size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
    -        PREFETCH_AREA(dictHashLong, hashTableBytes)
    -        PREFETCH_AREA(dictHashSmall, chainTableBytes)
    +        PREFETCH_AREA(dictHashLong, hashTableBytes);
    +        PREFETCH_AREA(dictHashSmall, chainTableBytes);
         }
     
         /* init */
    @@ -589,7 +597,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState(
     }
     
     
    -static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_compressBlock_doubleFast_extDict_generic(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize,
             U32 const mls /* template */)
    @@ -756,3 +766,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
             return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
         }
     }
    +
    +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
    diff --git a/third-party/zstd/lib/compress/zstd_double_fast.h b/third-party/zstd/lib/compress/zstd_double_fast.h
    index 6f0047c4..ce6ed8c9 100644
    --- a/third-party/zstd/lib/compress/zstd_double_fast.h
    +++ b/third-party/zstd/lib/compress/zstd_double_fast.h
    @@ -18,9 +18,12 @@ extern "C" {
     #include "../common/mem.h"      /* U32 */
     #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
     
    +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
    +
     void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
                                   void const* end, ZSTD_dictTableLoadMethod_e dtlm,
                                   ZSTD_tableFillPurpose_e tfp);
    +
     size_t ZSTD_compressBlock_doubleFast(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    @@ -31,6 +34,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
     
    +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast
    +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState
    +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict
    +#else
    +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL
    +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL
    +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL
    +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
     
     #if defined (__cplusplus)
     }
    diff --git a/third-party/zstd/lib/compress/zstd_fast.c b/third-party/zstd/lib/compress/zstd_fast.c
    index 5f2c6a2e..6c4554cf 100644
    --- a/third-party/zstd/lib/compress/zstd_fast.c
    +++ b/third-party/zstd/lib/compress/zstd_fast.c
    @@ -11,7 +11,9 @@
     #include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
     #include "zstd_fast.h"
     
    -static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
                             const void* const end,
                             ZSTD_dictTableLoadMethod_e dtlm)
     {
    @@ -46,7 +48,9 @@ static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
                     }   }   }   }
     }
     
    -static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
                             const void* const end,
                             ZSTD_dictTableLoadMethod_e dtlm)
     {
    @@ -139,8 +143,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
      *
      * This is also the work we do at the beginning to enter the loop initially.
      */
    -FORCE_INLINE_TEMPLATE size_t
    -ZSTD_compressBlock_fast_noDict_generic(
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_compressBlock_fast_noDict_generic(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize,
             U32 const mls, U32 const hasStep)
    @@ -456,6 +461,7 @@ size_t ZSTD_compressBlock_fast(
     }
     
     FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_compressBlock_fast_dictMatchState_generic(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
    @@ -502,7 +508,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     
         if (ms->prefetchCDictTables) {
             size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
    -        PREFETCH_AREA(dictHashTable, hashTableBytes)
    +        PREFETCH_AREA(dictHashTable, hashTableBytes);
         }
     
         /* init */
    @@ -681,7 +687,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
     }
     
     
    -static size_t ZSTD_compressBlock_fast_extDict_generic(
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_compressBlock_fast_extDict_generic(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
     {
    diff --git a/third-party/zstd/lib/compress/zstd_lazy.c b/third-party/zstd/lib/compress/zstd_lazy.c
    index 5ba88e86..67dd55fd 100644
    --- a/third-party/zstd/lib/compress/zstd_lazy.c
    +++ b/third-party/zstd/lib/compress/zstd_lazy.c
    @@ -12,6 +12,11 @@
     #include "zstd_lazy.h"
     #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
     
    +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
    +
     #define kLazySkippingStep 8
     
     
    @@ -19,8 +24,9 @@
     *  Binary Tree search
     ***************************************/
     
    -static void
    -ZSTD_updateDUBT(ZSTD_matchState_t* ms,
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
                     const BYTE* ip, const BYTE* iend,
                     U32 mls)
     {
    @@ -63,8 +69,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
      *  sort one already inserted but unsorted position
      *  assumption : curr >= btlow == (curr - btmask)
      *  doesn't fail */
    -static void
    -ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
                      U32 curr, const BYTE* inputEnd,
                      U32 nbCompares, U32 btLow,
                      const ZSTD_dictMode_e dictMode)
    @@ -152,8 +159,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
     }
     
     
    -static size_t
    -ZSTD_DUBT_findBetterDictMatch (
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_DUBT_findBetterDictMatch (
             const ZSTD_matchState_t* ms,
             const BYTE* const ip, const BYTE* const iend,
             size_t* offsetPtr,
    @@ -230,8 +238,9 @@ ZSTD_DUBT_findBetterDictMatch (
     }
     
     
    -static size_t
    -ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                             const BYTE* const ip, const BYTE* const iend,
                             size_t* offBasePtr,
                             U32 const mls,
    @@ -381,8 +390,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
     
     
     /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
    -FORCE_INLINE_TEMPLATE size_t
    -ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                     const BYTE* const ip, const BYTE* const iLimit,
                           size_t* offBasePtr,
                     const U32 mls /* template */,
    @@ -617,7 +627,9 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
     
     /* Update chains up to ip (excluded)
        Assumption : always within prefix (i.e. not within extDict) */
    -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32 ZSTD_insertAndFindFirstIndex_internal(
                             ZSTD_matchState_t* ms,
                             const ZSTD_compressionParameters* const cParams,
                             const BYTE* ip, U32 const mls, U32 const lazySkipping)
    @@ -651,6 +663,7 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
     
     /* inlining is important to hardwire a hot branch (template emulation) */
     FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_HcFindBestMatch(
                             ZSTD_matchState_t* ms,
                             const BYTE* const ip, const BYTE* const iLimit,
    @@ -819,7 +832,9 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* t
      * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
      * but not beyond iLimit.
      */
    -FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
                                        U32 const rowLog, U32 const mls,
                                        U32 idx, const BYTE* const iLimit)
     {
    @@ -845,7 +860,9 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
      * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
      * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
      */
    -FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
                                                       BYTE const* tagTable, BYTE const* base,
                                                       U32 idx, U32 const hashLog,
                                                       U32 const rowLog, U32 const mls,
    @@ -863,10 +880,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab
     /* ZSTD_row_update_internalImpl():
      * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
      */
    -FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
    -                                                        U32 updateStartIdx, U32 const updateEndIdx,
    -                                                        U32 const mls, U32 const rowLog,
    -                                                        U32 const rowMask, U32 const useCache)
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
    +                                  U32 updateStartIdx, U32 const updateEndIdx,
    +                                  U32 const mls, U32 const rowLog,
    +                                  U32 const rowMask, U32 const useCache)
     {
         U32* const hashTable = ms->hashTable;
         BYTE* const tagTable = ms->tagTable;
    @@ -892,9 +911,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
      * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
      * Skips sections of long matches as is necessary.
      */
    -FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
    -                                                    U32 const mls, U32 const rowLog,
    -                                                    U32 const rowMask, U32 const useCache)
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
    +                              U32 const mls, U32 const rowLog,
    +                              U32 const rowMask, U32 const useCache)
     {
         U32 idx = ms->nextToUpdate;
         const BYTE* const base = ms->window.base;
    @@ -1102,20 +1123,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGr
     
     /* The high-level approach of the SIMD row based match finder is as follows:
      * - Figure out where to insert the new entry:
    - *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
    - *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
     + *      - Generate a hash for the current input position and split it into one byte of tag and `rowHashLog` bits of index.
     + *           - The hash is salted by a value that changes on every context reset, so when the same table is used
     + *             we will avoid collisions that would otherwise slow us down by introducing phantom matches.
    + *      - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
      *        which row to insert into.
    - *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
    - *        be considered as a circular buffer with a "head" index that resides in the tagTable.
    - *      - Also insert the "tag" into the equivalent row and position in the tagTable.
    - *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
    - *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
    - *                  for alignment/performance reasons, leaving some bytes unused.
    - * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
    + *      - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
    + *        be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
    + *        per row).
    + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
      *   generate a bitfield that we can cycle through to check the collisions in the hash table.
      * - Pick the longest match.
    + * - Insert the tag into the equivalent row and position in the tagTable.
      */
     FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_RowFindBestMatch(
                             ZSTD_matchState_t* ms,
                             const BYTE* const ip, const BYTE* const iLimit,
    @@ -1489,8 +1511,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
     *  Common parser - lazy strategy
     *********************************/
     
    -FORCE_INLINE_TEMPLATE size_t
    -ZSTD_compressBlock_lazy_generic(
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_compressBlock_lazy_generic(
                             ZSTD_matchState_t* ms, seqStore_t* seqStore,
                             U32 rep[ZSTD_REP_NUM],
                             const void* src, size_t srcSize,
    @@ -1754,152 +1777,163 @@ ZSTD_compressBlock_lazy_generic(
         /* Return the last literals size */
         return (size_t)(iend - anchor);
     }
    +#endif /* build exclusions */
     
     
    -size_t ZSTD_compressBlock_btlazy2(
    +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_greedy(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
     }
     
    -size_t ZSTD_compressBlock_lazy2(
    +size_t ZSTD_compressBlock_greedy_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
     }
     
    -size_t ZSTD_compressBlock_lazy(
    +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
     }
     
    -size_t ZSTD_compressBlock_greedy(
    +size_t ZSTD_compressBlock_greedy_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
     }
     
    -size_t ZSTD_compressBlock_btlazy2_dictMatchState(
    +size_t ZSTD_compressBlock_greedy_dictMatchState_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
     }
     
    -size_t ZSTD_compressBlock_lazy2_dictMatchState(
    +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
     }
    +#endif
     
    -size_t ZSTD_compressBlock_lazy_dictMatchState(
    +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_lazy(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
     }
     
    -size_t ZSTD_compressBlock_greedy_dictMatchState(
    +size_t ZSTD_compressBlock_lazy_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
     }
     
    -
    -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
    +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
     }
     
    -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
    +size_t ZSTD_compressBlock_lazy_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
     }
     
    -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
    +size_t ZSTD_compressBlock_lazy_dictMatchState_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
     }
     
    -/* Row-based matchfinder */
    -size_t ZSTD_compressBlock_lazy2_row(
    +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
     }
    +#endif
     
    -size_t ZSTD_compressBlock_lazy_row(
    +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_lazy2(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
     }
     
    -size_t ZSTD_compressBlock_greedy_row(
    +size_t ZSTD_compressBlock_lazy2_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
     }
     
    -size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
    +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
     }
     
    -size_t ZSTD_compressBlock_lazy_dictMatchState_row(
    +size_t ZSTD_compressBlock_lazy2_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
     }
     
    -size_t ZSTD_compressBlock_greedy_dictMatchState_row(
    +size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
     }
     
    -
     size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
         return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
     }
    +#endif
     
    -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
    +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_btlazy2(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
     }
     
    -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
    +size_t ZSTD_compressBlock_btlazy2_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
    +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
     }
    +#endif
     
    +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
     FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_compressBlock_lazy_extDict_generic(
                             ZSTD_matchState_t* ms, seqStore_t* seqStore,
                             U32 rep[ZSTD_REP_NUM],
    @@ -2101,8 +2135,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         /* Return the last literals size */
         return (size_t)(iend - anchor);
     }
    +#endif /* build exclusions */
     
    -
    +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
     size_t ZSTD_compressBlock_greedy_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
    @@ -2110,48 +2145,55 @@ size_t ZSTD_compressBlock_greedy_extDict(
         return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
     }
     
    -size_t ZSTD_compressBlock_lazy_extDict(
    +size_t ZSTD_compressBlock_greedy_extDict_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
    -
     {
    -    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
    +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
     }
    +#endif
     
    -size_t ZSTD_compressBlock_lazy2_extDict(
    +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_lazy_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     
     {
    -    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
    +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
     }
     
    -size_t ZSTD_compressBlock_btlazy2_extDict(
    +size_t ZSTD_compressBlock_lazy_extDict_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
     
     {
    -    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
    +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
     }
    +#endif
     
    -size_t ZSTD_compressBlock_greedy_extDict_row(
    +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_lazy2_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
    +
     {
    -    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
    +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
     }
     
    -size_t ZSTD_compressBlock_lazy_extDict_row(
    +size_t ZSTD_compressBlock_lazy2_extDict_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
    -
     {
    -    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
    +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
     }
    +#endif
     
    -size_t ZSTD_compressBlock_lazy2_extDict_row(
    +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_btlazy2_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize)
    +
     {
    -    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
    +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
     }
    +#endif
    diff --git a/third-party/zstd/lib/compress/zstd_lazy.h b/third-party/zstd/lib/compress/zstd_lazy.h
    index 3bde6733..3635813b 100644
    --- a/third-party/zstd/lib/compress/zstd_lazy.h
    +++ b/third-party/zstd/lib/compress/zstd_lazy.h
    @@ -27,98 +27,173 @@ extern "C" {
     
     #define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
     
    +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
     U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
     void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
     
     void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
     
     void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */
    +#endif
     
    -size_t ZSTD_compressBlock_btlazy2(
    +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_greedy(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy2(
    +size_t ZSTD_compressBlock_greedy_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy(
    +size_t ZSTD_compressBlock_greedy_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_greedy(
    +size_t ZSTD_compressBlock_greedy_dictMatchState_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy2_row(
    +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy_row(
    +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_greedy_row(
    +size_t ZSTD_compressBlock_greedy_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -
    -size_t ZSTD_compressBlock_btlazy2_dictMatchState(
    +size_t ZSTD_compressBlock_greedy_extDict_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy2_dictMatchState(
    +
    +#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy
    +#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row
    +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict
    +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row
    +#else
    +#define ZSTD_COMPRESSBLOCK_GREEDY NULL
    +#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL
    +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL
    +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL
    +#endif
    +
    +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_lazy(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy_dictMatchState(
    +size_t ZSTD_compressBlock_lazy_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_greedy_dictMatchState(
    +size_t ZSTD_compressBlock_lazy_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
    +size_t ZSTD_compressBlock_lazy_dictMatchState_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy_dictMatchState_row(
    +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_greedy_dictMatchState_row(
    +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -
    -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
    +size_t ZSTD_compressBlock_lazy_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
    +size_t ZSTD_compressBlock_lazy_extDict_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
    +
    +#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy
    +#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row
    +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState
    +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row
    +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch
    +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row
    +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict
    +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row
    +#else
    +#define ZSTD_COMPRESSBLOCK_LAZY NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL
    +#endif
    +
    +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_lazy2(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
    +size_t ZSTD_compressBlock_lazy2_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
    +size_t ZSTD_compressBlock_lazy2_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
    +size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -
    -size_t ZSTD_compressBlock_greedy_extDict(
    +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy_extDict(
    +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
     size_t ZSTD_compressBlock_lazy2_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_greedy_extDict_row(
    +size_t ZSTD_compressBlock_lazy2_extDict_row(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy_extDict_row(
    +
    +#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2
    +#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row
    +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict
    +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row
    +#else
    +#define ZSTD_COMPRESSBLOCK_LAZY2 NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL
    +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL
    +#endif
    +
    +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_btlazy2(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_lazy2_extDict_row(
    +size_t ZSTD_compressBlock_btlazy2_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
     size_t ZSTD_compressBlock_btlazy2_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
     
    +#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2
    +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState
    +#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict
    +#else
    +#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL
    +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL
    +#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL
    +#endif
    +
     
     #if defined (__cplusplus)
     }
    diff --git a/third-party/zstd/lib/compress/zstd_ldm.c b/third-party/zstd/lib/compress/zstd_ldm.c
    index 3d74ff19..17c069fe 100644
    --- a/third-party/zstd/lib/compress/zstd_ldm.c
    +++ b/third-party/zstd/lib/compress/zstd_ldm.c
    @@ -246,7 +246,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
             break;
     
         case ZSTD_dfast:
    +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
             ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
    +#else
    +        assert(0); /* shouldn't be called: cparams should've been adjusted. */
    +#endif
             break;
     
         case ZSTD_greedy:
    @@ -318,7 +322,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
         }
     }
     
    -static size_t ZSTD_ldm_generateSequences_internal(
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_ldm_generateSequences_internal(
             ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
             ldmParams_t const* params, void const* src, size_t srcSize)
     {
    @@ -689,7 +695,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             /* maybeSplitSequence updates rawSeqStore->pos */
             rawSeq const sequence = maybeSplitSequence(rawSeqStore,
                                                        (U32)(iend - ip), minMatch);
    -        int i;
             /* End signal */
             if (sequence.offset == 0)
                 break;
    @@ -702,6 +707,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             /* Run the block compressor */
             DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength);
             {
    +            int i;
                 size_t const newLitLength =
                     blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
                 ip += sequence.litLength;
    diff --git a/third-party/zstd/lib/compress/zstd_opt.c b/third-party/zstd/lib/compress/zstd_opt.c
    index f02a7609..e63073e5 100644
    --- a/third-party/zstd/lib/compress/zstd_opt.c
    +++ b/third-party/zstd/lib/compress/zstd_opt.c
    @@ -12,6 +12,9 @@
     #include "hist.h"
     #include "zstd_opt.h"
     
    +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
     
     #define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
     #define ZSTD_MAX_PRICE     (1<<30)
    @@ -264,6 +267,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
                                     const optState_t* const optPtr,
                                     int optLevel)
     {
    +    DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength);
         if (litLength == 0) return 0;
     
         if (!ZSTD_compressedLiterals(optPtr))
    @@ -402,9 +406,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
     
     /* Update hashTable3 up to ip (excluded)
        Assumption : always within prefix (i.e. not within extDict) */
    -static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
    -                                              U32* nextToUpdate3,
    -                                              const BYTE* const ip)
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
    +                                       U32* nextToUpdate3,
    +                                       const BYTE* const ip)
     {
         U32* const hashTable3 = ms->hashTable3;
         U32 const hashLog3 = ms->hashLog3;
    @@ -431,7 +437,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
      * @param ip assumed <= iend-8 .
      * @param target The target of ZSTD_updateTree_internal() - we are filling to this position
      * @return : nb of positions added */
    -static U32 ZSTD_insertBt1(
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32 ZSTD_insertBt1(
                     const ZSTD_matchState_t* ms,
                     const BYTE* const ip, const BYTE* const iend,
                     U32 const target,
    @@ -550,6 +558,7 @@ static U32 ZSTD_insertBt1(
     }
     
     FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     void ZSTD_updateTree_internal(
                     ZSTD_matchState_t* ms,
                     const BYTE* const ip, const BYTE* const iend,
    @@ -558,7 +567,7 @@ void ZSTD_updateTree_internal(
         const BYTE* const base = ms->window.base;
         U32 const target = (U32)(ip - base);
         U32 idx = ms->nextToUpdate;
    -    DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
    +    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
                     idx, target, dictMode);
     
         while(idx < target) {
    @@ -575,7 +584,9 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
         ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
     }
     
    -FORCE_INLINE_TEMPLATE U32
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32
     ZSTD_insertBtAndGetAllMatches (
                     ZSTD_match_t* matches,  /* store result (found matches) in this table (presumed large enough) */
                     ZSTD_matchState_t* ms,
    @@ -816,7 +827,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)(
         U32 const ll0,
         U32 const lengthToBeat);
     
    -FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +U32 ZSTD_btGetAllMatches_internal(
             ZSTD_match_t* matches,
             ZSTD_matchState_t* ms,
             U32* nextToUpdate3,
    @@ -1035,11 +1048,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
     *  Optimal parser
     *********************************/
     
    -static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
    -{
    -    return sol.litlen + sol.mlen;
    -}
    -
     #if 0 /* debug */
     
     static void
    @@ -1057,7 +1065,13 @@ listStats(const U32* table, int lastEltID)
     
     #endif
     
    -FORCE_INLINE_TEMPLATE size_t
    +#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel)
    +#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel)
    +#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1))
    +
    +FORCE_INLINE_TEMPLATE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t
     ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                                    seqStore_t* seqStore,
                                    U32 rep[ZSTD_REP_NUM],
    @@ -1083,10 +1097,10 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
     
         ZSTD_optimal_t* const opt = optStatePtr->priceTable;
         ZSTD_match_t* const matches = optStatePtr->matchTable;
    -    ZSTD_optimal_t lastSequence;
    +    ZSTD_optimal_t lastStretch;
         ZSTD_optLdm_t optLdm;
     
    -    ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t));
    +    ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t));
     
         optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
         optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
    @@ -1108,19 +1122,31 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                 U32 const ll0 = !litlen;
                 U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
                 ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
    -                                              (U32)(ip-istart), (U32)(iend - ip));
    -            if (!nbMatches) { ip++; continue; }
    +                                              (U32)(ip-istart), (U32)(iend-ip));
    +            if (!nbMatches) {
    +                DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart));
    +                ip++;
    +                continue;
    +            }
    +
    +            /* Match found: let's store this solution, and eventually find more candidates.
    +             * During this forward pass, @opt is used to store stretches,
    +             * defined as "a match followed by N literals".
    +             * Note how this is different from a Sequence, which is "N literals followed by a match".
    +             * Storing stretches allows us to store different match predecessors
    +             * for each literal position part of a literals run. */
     
                 /* initialize opt[0] */
    -            { U32 i ; for (i=0; i immediate encoding */
                 {   U32 const maxML = matches[nbMatches-1].len;
    @@ -1129,82 +1155,106 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                                 nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
     
                     if (maxML > sufficient_len) {
    -                    lastSequence.litlen = litlen;
    -                    lastSequence.mlen = maxML;
    -                    lastSequence.off = maxOffBase;
    -                    DEBUGLOG(6, "large match (%u>%u), immediate encoding",
    +                    lastStretch.litlen = 0;
    +                    lastStretch.mlen = maxML;
    +                    lastStretch.off = maxOffBase;
    +                    DEBUGLOG(6, "large match (%u>%u) => immediate encoding",
                                     maxML, sufficient_len);
                         cur = 0;
    -                    last_pos = ZSTD_totalLen(lastSequence);
    +                    last_pos = maxML;
                         goto _shortestPath;
                 }   }
     
                 /* set prices for first matches starting position == 0 */
                 assert(opt[0].price >= 0);
    -            {   U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
    -                U32 pos;
    +            {   U32 pos;
                     U32 matchNb;
                     for (pos = 1; pos < minMatch; pos++) {
    -                    opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
    +                    opt[pos].price = ZSTD_MAX_PRICE;
    +                    opt[pos].mlen = 0;
    +                    opt[pos].litlen = litlen + pos;
                     }
                     for (matchNb = 0; matchNb < nbMatches; matchNb++) {
                         U32 const offBase = matches[matchNb].off;
                         U32 const end = matches[matchNb].len;
                         for ( ; pos <= end ; pos++ ) {
    -                        U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
    -                        U32 const sequencePrice = literalsPrice + matchPrice;
    +                        int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
    +                        int const sequencePrice = opt[0].price + matchPrice;
                             DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
    -                                    pos, ZSTD_fCost((int)sequencePrice));
    +                                    pos, ZSTD_fCost(sequencePrice));
                             opt[pos].mlen = pos;
                             opt[pos].off = offBase;
    -                        opt[pos].litlen = litlen;
    -                        opt[pos].price = (int)sequencePrice;
    -                }   }
    +                        opt[pos].litlen = 0; /* end of match */
    +                        opt[pos].price = sequencePrice + LL_PRICE(0);
    +                    }
    +                }
                     last_pos = pos-1;
    +                opt[pos].price = ZSTD_MAX_PRICE;
                 }
             }
     
             /* check further positions */
             for (cur = 1; cur <= last_pos; cur++) {
                 const BYTE* const inr = ip + cur;
    -            assert(cur < ZSTD_OPT_NUM);
    -            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
    +            assert(cur <= ZSTD_OPT_NUM);
    +            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur);
     
                 /* Fix current position with one literal if cheaper */
    -            {   U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
    +            {   U32 const litlen = opt[cur-1].litlen + 1;
                     int const price = opt[cur-1].price
    -                                + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
    -                                + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
    -                                - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
    +                                + LIT_PRICE(ip+cur-1)
    +                                + LL_INCPRICE(litlen);
                     assert(price < 1000000000); /* overflow check */
                     if (price <= opt[cur].price) {
    +                    ZSTD_optimal_t const prevMatch = opt[cur];
                         DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
                                     inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
                                     opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
    -                    opt[cur].mlen = 0;
    -                    opt[cur].off = 0;
    +                    opt[cur] = opt[cur-1];
                         opt[cur].litlen = litlen;
                         opt[cur].price = price;
    +                    if ( (optLevel >= 1) /* additional check only for higher modes */
    +                      && (prevMatch.litlen == 0) /* replace a match */
    +                      && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */
    +                      && LIKELY(ip + cur < iend)
    +                    ) {
    +                        /* check next position, in case it would be cheaper */
    +                        int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1);
    +                        int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1);
    +                        DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f",
    +                                cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals));
    +                        if ( (with1literal < withMoreLiterals)
    +                          && (with1literal < opt[cur+1].price) ) {
    +                            /* update offset history - before it disappears */
    +                            U32 const prev = cur - prevMatch.mlen;
    +                            repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0);
    +                            assert(cur >= prevMatch.mlen);
    +                            DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !",
    +                                        ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals),
    +                                        newReps.rep[0], newReps.rep[1], newReps.rep[2] );
    +                            opt[cur+1] = prevMatch;  /* mlen & offbase */
    +                            ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t));
    +                            opt[cur+1].litlen = 1;
    +                            opt[cur+1].price = with1literal;
    +                            if (last_pos < cur+1) last_pos = cur+1;
    +                        }
    +                    }
                     } else {
    -                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
    -                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
    -                                opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
    +                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)",
    +                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price));
                     }
                 }
     
    -            /* Set the repcodes of the current position. We must do it here
    -             * because we rely on the repcodes of the 2nd to last sequence being
    -             * correct to set the next chunks repcodes during the backward
    -             * traversal.
    +            /* Offset history is not updated during match comparison.
    +             * Do it here, now that the match is selected and confirmed.
                  */
                 ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
                 assert(cur >= opt[cur].mlen);
    -            if (opt[cur].mlen != 0) {
    +            if (opt[cur].litlen == 0) {
    +                /* just finished a match => alter offset history */
                     U32 const prev = cur - opt[cur].mlen;
    -                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
    +                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0);
                     ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
    -            } else {
    -                ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
                 }
     
                 /* last match must start at a minimum distance of 8 from oend */
    @@ -1214,15 +1264,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
     
                 if ( (optLevel==0) /*static_test*/
                   && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
    -                DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
    +                DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1);
                     continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
                 }
     
                 assert(opt[cur].price >= 0);
    -            {   U32 const ll0 = (opt[cur].mlen != 0);
    -                U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
    -                U32 const previousPrice = (U32)opt[cur].price;
    -                U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
    +            {   U32 const ll0 = (opt[cur].litlen == 0);
    +                int const previousPrice = opt[cur].price;
    +                int const basePrice = previousPrice + LL_PRICE(0);
                     U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
                     U32 matchNb;
     
    @@ -1234,18 +1283,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                         continue;
                     }
     
    -                {   U32 const maxML = matches[nbMatches-1].len;
    -                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
    -                                inr-istart, cur, nbMatches, maxML);
    -
    -                    if ( (maxML > sufficient_len)
    -                      || (cur + maxML >= ZSTD_OPT_NUM) ) {
    -                        lastSequence.mlen = maxML;
    -                        lastSequence.off = matches[nbMatches-1].off;
    -                        lastSequence.litlen = litlen;
    -                        cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0;  /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
    -                        last_pos = cur + ZSTD_totalLen(lastSequence);
    -                        if (cur > ZSTD_OPT_NUM) cur = 0;   /* underflow => first match */
    +                {   U32 const longestML = matches[nbMatches-1].len;
    +                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u",
    +                                inr-istart, cur, nbMatches, longestML);
    +
    +                    if ( (longestML > sufficient_len)
    +                      || (cur + longestML >= ZSTD_OPT_NUM)
    +                      || (ip + cur + longestML >= iend) ) {
    +                        lastStretch.mlen = longestML;
    +                        lastStretch.off = matches[nbMatches-1].off;
    +                        lastStretch.litlen = 0;
    +                        last_pos = cur + longestML;
                             goto _shortestPath;
                     }   }
     
    @@ -1257,19 +1305,24 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                         U32 mlen;
     
                         DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
    -                                matchNb, matches[matchNb].off, lastML, litlen);
    +                                matchNb, matches[matchNb].off, lastML, opt[cur].litlen);
     
                         for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
                             U32 const pos = cur + mlen;
    -                        int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
    +                        int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
     
                             if ((pos > last_pos) || (price < opt[pos].price)) {
                                 DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
                                             pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
    -                            while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; }   /* fill empty positions */
    +                            while (last_pos < pos) {
    +                                /* fill empty positions, for future comparisons */
    +                                last_pos++;
    +                                opt[last_pos].price = ZSTD_MAX_PRICE;
    +                                opt[last_pos].litlen = !0;  /* just needs to be != 0, to mean "not an end of match" */
    +                            }
                                 opt[pos].mlen = mlen;
                                 opt[pos].off = offset;
    -                            opt[pos].litlen = litlen;
    +                            opt[pos].litlen = 0;
                                 opt[pos].price = price;
                             } else {
                                 DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
    @@ -1277,47 +1330,81 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                                 if (optLevel==0) break;  /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
                             }
                 }   }   }
    +            opt[last_pos+1].price = ZSTD_MAX_PRICE;
             }  /* for (cur = 1; cur <= last_pos; cur++) */
     
    -        lastSequence = opt[last_pos];
    -        cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0;  /* single sequence, and it starts before `ip` */
    -        assert(cur < ZSTD_OPT_NUM);  /* control overflow*/
    +        lastStretch = opt[last_pos];
    +        assert(cur >= lastStretch.mlen);
    +        cur = last_pos - lastStretch.mlen;
     
     _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
             assert(opt[0].mlen == 0);
    +        assert(last_pos >= lastStretch.mlen);
    +        assert(cur == last_pos - lastStretch.mlen);
     
    -        /* Set the next chunk's repcodes based on the repcodes of the beginning
    -         * of the last match, and the last sequence. This avoids us having to
    -         * update them while traversing the sequences.
    -         */
    -        if (lastSequence.mlen != 0) {
    -            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
    -            ZSTD_memcpy(rep, &reps, sizeof(reps));
    +        if (lastStretch.mlen==0) {
    +            /* no solution : all matches have been converted into literals */
    +            assert(lastStretch.litlen == (ip - anchor) + last_pos);
    +            ip += last_pos;
    +            continue;
    +        }
    +        assert(lastStretch.off > 0);
    +
    +        /* Update offset history */
    +        if (lastStretch.litlen == 0) {
    +            /* finishing on a match : update offset history */
    +            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0);
    +            ZSTD_memcpy(rep, &reps, sizeof(repcodes_t));
             } else {
    -            ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
    +            ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t));
    +            assert(cur >= lastStretch.litlen);
    +            cur -= lastStretch.litlen;
             }
     
    -        {   U32 const storeEnd = cur + 1;
    +        /* Let's write the shortest path solution.
    +         * It is stored in @opt in reverse order,
    +         * starting from @storeEnd (==cur+2),
    +         * effectively partially @opt overwriting.
    +         * Content is changed too:
    +         * - So far, @opt stored stretches, aka a match followed by literals
    +         * - Now, it will store sequences, aka literals followed by a match
    +         */
    +        {   U32 const storeEnd = cur + 2;
                 U32 storeStart = storeEnd;
    -            U32 seqPos = cur;
    +            U32 stretchPos = cur;
     
                 DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
                             last_pos, cur); (void)last_pos;
    -            assert(storeEnd < ZSTD_OPT_NUM);
    -            DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
    -                        storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
    -            opt[storeEnd] = lastSequence;
    -            while (seqPos > 0) {
    -                U32 const backDist = ZSTD_totalLen(opt[seqPos]);
    +            assert(storeEnd < ZSTD_OPT_SIZE);
    +            DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
    +                        storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off);
    +            if (lastStretch.litlen > 0) {
    +                /* last "sequence" is unfinished: just a bunch of literals */
    +                opt[storeEnd].litlen = lastStretch.litlen;
    +                opt[storeEnd].mlen = 0;
    +                storeStart = storeEnd-1;
    +                opt[storeStart] = lastStretch;
     +            } else {
    +                opt[storeEnd] = lastStretch;  /* note: litlen will be fixed */
    +                storeStart = storeEnd;
    +            }
    +            while (1) {
    +                ZSTD_optimal_t nextStretch = opt[stretchPos];
    +                opt[storeStart].litlen = nextStretch.litlen;
    +                DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)",
    +                            opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off);
    +                if (nextStretch.mlen == 0) {
    +                    /* reaching beginning of segment */
    +                    break;
    +                }
                     storeStart--;
    -                DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
    -                            seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
    -                opt[storeStart] = opt[seqPos];
    -                seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
    +                opt[storeStart] = nextStretch; /* note: litlen will be fixed */
    +                assert(nextStretch.litlen + nextStretch.mlen <= stretchPos);
    +                stretchPos -= nextStretch.litlen + nextStretch.mlen;
                 }
     
                 /* save sequences */
    -            DEBUGLOG(6, "sending selected sequences into seqStore")
    +            DEBUGLOG(6, "sending selected sequences into seqStore");
                 {   U32 storePos;
                     for (storePos=storeStart; storePos <= storeEnd; storePos++) {
                         U32 const llen = opt[storePos].litlen;
    @@ -1339,6 +1426,9 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                         anchor += advance;
                         ip = anchor;
                 }   }
    +            DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]);
    +
    +            /* update all costs */
                 ZSTD_setBasePrices(optStatePtr, optLevel);
             }
         }   /* while (ip < ilimit) */
    @@ -1346,21 +1436,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
         /* Return the last literals size */
         return (size_t)(iend - anchor);
     }
    +#endif /* build exclusions */
     
    +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
     static size_t ZSTD_compressBlock_opt0(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
     {
         return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
     }
    +#endif
     
    +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
     static size_t ZSTD_compressBlock_opt2(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
     {
         return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
     }
    +#endif
     
    +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
     size_t ZSTD_compressBlock_btopt(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             const void* src, size_t srcSize)
    @@ -1368,20 +1464,23 @@ size_t ZSTD_compressBlock_btopt(
         DEBUGLOG(5, "ZSTD_compressBlock_btopt");
         return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
     }
    +#endif
     
     
     
     
    +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
     /* ZSTD_initStats_ultra():
      * make a first compression pass, just to seed stats with more accurate starting values.
      * only works on first block, with no dictionary and no ldm.
      * this function cannot error out, its narrow contract must be respected.
      */
    -static void
    -ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
    -                     seqStore_t* seqStore,
    -                     U32 rep[ZSTD_REP_NUM],
    -               const void* src, size_t srcSize)
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +void ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
    +                          seqStore_t* seqStore,
    +                          U32 rep[ZSTD_REP_NUM],
    +                    const void* src, size_t srcSize)
     {
         U32 tmpRep[ZSTD_REP_NUM];  /* updated rep codes will sink here */
         ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));
    @@ -1425,7 +1524,7 @@ size_t ZSTD_compressBlock_btultra2(
          * Consequently, this can only work if no data has been previously loaded in tables,
          * aka, no dictionary, no prefix, no ldm preprocessing.
          * The compression ratio gain is generally small (~0.5% on first block),
    -    ** the cost is 2x cpu time on first block. */
    +     * the cost is 2x cpu time on first block. */
         assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
         if ( (ms->opt.litLengthSum==0)   /* first block */
           && (seqStore->sequences == seqStore->sequencesStart)  /* no ldm */
    @@ -1438,7 +1537,9 @@ size_t ZSTD_compressBlock_btultra2(
     
         return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
     }
    +#endif
     
    +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
     size_t ZSTD_compressBlock_btopt_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             const void* src, size_t srcSize)
    @@ -1446,18 +1547,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState(
         return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
     }
     
    -size_t ZSTD_compressBlock_btultra_dictMatchState(
    +size_t ZSTD_compressBlock_btopt_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             const void* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
    +    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
     }
    +#endif
     
    -size_t ZSTD_compressBlock_btopt_extDict(
    +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_btultra_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             const void* src, size_t srcSize)
     {
    -    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
    +    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
     }
     
     size_t ZSTD_compressBlock_btultra_extDict(
    @@ -1466,6 +1569,7 @@ size_t ZSTD_compressBlock_btultra_extDict(
     {
         return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
     }
    +#endif
     
     /* note : no btultra2 variant for extDict nor dictMatchState,
      * because btultra2 is not meant to work with dictionaries
    diff --git a/third-party/zstd/lib/compress/zstd_opt.h b/third-party/zstd/lib/compress/zstd_opt.h
    index 342e5a31..d4e71131 100644
    --- a/third-party/zstd/lib/compress/zstd_opt.h
    +++ b/third-party/zstd/lib/compress/zstd_opt.h
    @@ -17,30 +17,40 @@ extern "C" {
     
     #include "zstd_compress_internal.h"
     
    +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
    + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
     /* used in ZSTD_loadDictionaryContent() */
     void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
    +#endif
     
    +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
     size_t ZSTD_compressBlock_btopt(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_btultra(
    +size_t ZSTD_compressBlock_btopt_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -size_t ZSTD_compressBlock_btultra2(
    +size_t ZSTD_compressBlock_btopt_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
     
    +#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt
    +#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState
    +#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict
    +#else
    +#define ZSTD_COMPRESSBLOCK_BTOPT NULL
    +#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL
    +#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL
    +#endif
     
    -size_t ZSTD_compressBlock_btopt_dictMatchState(
    +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
    +size_t ZSTD_compressBlock_btultra(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
     size_t ZSTD_compressBlock_btultra_dictMatchState(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    -
    -size_t ZSTD_compressBlock_btopt_extDict(
    -        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
    -        void const* src, size_t srcSize);
     size_t ZSTD_compressBlock_btultra_extDict(
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
             void const* src, size_t srcSize);
    @@ -48,6 +58,20 @@ size_t ZSTD_compressBlock_btultra_extDict(
             /* note : no btultra2 variant for extDict nor dictMatchState,
              * because btultra2 is not meant to work with dictionaries
              * and is only specific for the first block (no prefix) */
    +size_t ZSTD_compressBlock_btultra2(
    +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
    +        void const* src, size_t srcSize);
    +
    +#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra
    +#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState
    +#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict
    +#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2
    +#else
    +#define ZSTD_COMPRESSBLOCK_BTULTRA NULL
    +#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL
    +#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL
    +#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL
    +#endif
     
     #if defined (__cplusplus)
     }
    diff --git a/third-party/zstd/lib/compress/zstdmt_compress.c b/third-party/zstd/lib/compress/zstdmt_compress.c
    index 67860755..86ccce31 100644
    --- a/third-party/zstd/lib/compress/zstdmt_compress.c
    +++ b/third-party/zstd/lib/compress/zstdmt_compress.c
    @@ -15,17 +15,13 @@
     #endif
     
     
    -/* ======   Constants   ====== */
    -#define ZSTDMT_OVERLAPLOG_DEFAULT 0
    -
    -
     /* ======   Dependencies   ====== */
    -#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
    +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
     #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */
     #include "../common/mem.h"         /* MEM_STATIC */
     #include "../common/pool.h"        /* threadpool */
     #include "../common/threading.h"   /* mutex */
    -#include "zstd_compress_internal.h"  /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
    +#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
     #include "zstd_ldm.h"
     #include "zstdmt_compress.h"
     
    @@ -44,12 +40,13 @@
      #  include <unistd.h>
      #  include <sys/times.h>
     
    -#  define DEBUG_PRINTHEX(l,p,n) {            \
    -    unsigned debug_u;                        \
    -    for (debug_u=0; debug_u<(n); debug_u++)  \
    -        RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
    -    RAWLOG(l, " \n");                        \
    -}
    +#  define DEBUG_PRINTHEX(l,p,n)                                       \
    +    do {                                                              \
    +        unsigned debug_u;                                             \
    +        for (debug_u=0; debug_u<(n); debug_u++)                       \
    +            RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
    +        RAWLOG(l, " \n");                                             \
    +    } while (0)
     
     static unsigned long long GetCurrentClockTimeMicroseconds(void)
     {
    @@ -61,25 +58,28 @@ static unsigned long long GetCurrentClockTimeMicroseconds(void)
     }  }
     
     #define MUTEX_WAIT_TIME_DLEVEL 6
    -#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) {          \
    -    if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) {   \
    -        unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \
    -        ZSTD_pthread_mutex_lock(mutex);           \
    -        {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
    -            unsigned long long const elapsedTime = (afterTime-beforeTime); \
    -            if (elapsedTime > 1000) {  /* or whatever threshold you like; I'm using 1 millisecond here */ \
    -                DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
    -                   elapsedTime, #mutex);          \
    -        }   }                                     \
    -    } else {                                      \
    -        ZSTD_pthread_mutex_lock(mutex);           \
    -    }                                             \
    -}
    +#define ZSTD_PTHREAD_MUTEX_LOCK(mutex)                                                  \
    +    do {                                                                                \
    +        if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) {                                     \
    +            unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds();    \
    +            ZSTD_pthread_mutex_lock(mutex);                                             \
    +            {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
    +                unsigned long long const elapsedTime = (afterTime-beforeTime);          \
    +                if (elapsedTime > 1000) {                                               \
    +                    /* or whatever threshold you like; I'm using 1 millisecond here */  \
    +                    DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL,                                    \
    +                        "Thread took %llu microseconds to acquire mutex %s \n",         \
    +                        elapsedTime, #mutex);                                           \
    +            }   }                                                                       \
    +        } else {                                                                        \
    +            ZSTD_pthread_mutex_lock(mutex);                                             \
    +        }                                                                               \
    +    } while (0)
     
     #else
     
     #  define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m)
    -#  define DEBUG_PRINTHEX(l,p,n) {}
    +#  define DEBUG_PRINTHEX(l,p,n) do { } while (0)
     
     #endif
     
    @@ -100,18 +100,39 @@ typedef struct ZSTDMT_bufferPool_s {
         unsigned totalBuffers;
         unsigned nbBuffers;
         ZSTD_customMem cMem;
    -    buffer_t bTable[1];   /* variable size */
    +    buffer_t* buffers;
     } ZSTDMT_bufferPool;
     
    +static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
    +{
    +    DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool);
    +    if (!bufPool) return;   /* compatibility with free on NULL */
    +    if (bufPool->buffers) {
    +        unsigned u;
     +        for (u=0; u<bufPool->totalBuffers; u++) {
    +            DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->buffers[u].start);
    +            ZSTD_customFree(bufPool->buffers[u].start, bufPool->cMem);
    +        }
    +        ZSTD_customFree(bufPool->buffers, bufPool->cMem);
    +    }
    +    ZSTD_pthread_mutex_destroy(&bufPool->poolMutex);
    +    ZSTD_customFree(bufPool, bufPool->cMem);
    +}
    +
     static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
     {
    -    ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_customCalloc(
    -        sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem);
    +    ZSTDMT_bufferPool* const bufPool =
    +        (ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem);
         if (bufPool==NULL) return NULL;
         if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
             ZSTD_customFree(bufPool, cMem);
             return NULL;
         }
    +    bufPool->buffers = (buffer_t*)ZSTD_customCalloc(maxNbBuffers * sizeof(buffer_t), cMem);
    +    if (bufPool->buffers==NULL) {
    +        ZSTDMT_freeBufferPool(bufPool);
    +        return NULL;
    +    }
         bufPool->bufferSize = 64 KB;
         bufPool->totalBuffers = maxNbBuffers;
         bufPool->nbBuffers = 0;
    @@ -119,32 +140,19 @@ static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_cu
         return bufPool;
     }
     
    -static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
    -{
    -    unsigned u;
    -    DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool);
    -    if (!bufPool) return;   /* compatibility with free on NULL */
     -    for (u=0; u<bufPool->totalBuffers; u++) {
    -        DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->bTable[u].start);
    -        ZSTD_customFree(bufPool->bTable[u].start, bufPool->cMem);
    -    }
    -    ZSTD_pthread_mutex_destroy(&bufPool->poolMutex);
    -    ZSTD_customFree(bufPool, bufPool->cMem);
    -}
    -
     /* only works at initialization, not during compression */
     static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool)
     {
    -    size_t const poolSize = sizeof(*bufPool)
    -                          + (bufPool->totalBuffers - 1) * sizeof(buffer_t);
    +    size_t const poolSize = sizeof(*bufPool);
    +    size_t const arraySize = bufPool->totalBuffers * sizeof(buffer_t);
         unsigned u;
         size_t totalBufferSize = 0;
         ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
          for (u=0; u<bufPool->totalBuffers; u++)
    -        totalBufferSize += bufPool->bTable[u].capacity;
    +        totalBufferSize += bufPool->buffers[u].capacity;
         ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
     
    -    return poolSize + totalBufferSize;
    +    return poolSize + arraySize + totalBufferSize;
     }
     
     /* ZSTDMT_setBufferSize() :
    @@ -187,9 +195,9 @@ static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool)
         DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize);
         ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
         if (bufPool->nbBuffers) {   /* try to use an existing buffer */
    -        buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)];
    +        buffer_t const buf = bufPool->buffers[--(bufPool->nbBuffers)];
             size_t const availBufferSize = buf.capacity;
    -        bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer;
    +        bufPool->buffers[bufPool->nbBuffers] = g_nullBuffer;
             if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) {
                 /* large enough, but not too much */
                 DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u",
    @@ -250,14 +258,14 @@ static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf)
         if (buf.start == NULL) return;   /* compatible with release on NULL */
         ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
         if (bufPool->nbBuffers < bufPool->totalBuffers) {
    -        bufPool->bTable[bufPool->nbBuffers++] = buf;  /* stored for later use */
    +        bufPool->buffers[bufPool->nbBuffers++] = buf;  /* stored for later use */
             DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u",
                         (U32)buf.capacity, (U32)(bufPool->nbBuffers-1));
             ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
             return;
         }
         ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
    -    /* Reached bufferPool capacity (should not happen) */
    +    /* Reached bufferPool capacity (note: should not happen) */
         DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing ");
         ZSTD_customFree(buf.start, bufPool->cMem);
     }
    @@ -350,16 +358,20 @@ typedef struct {
         int totalCCtx;
         int availCCtx;
         ZSTD_customMem cMem;
    -    ZSTD_CCtx* cctx[1];   /* variable size */
    +    ZSTD_CCtx** cctxs;
     } ZSTDMT_CCtxPool;
     
    -/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */
    +/* note : all CCtx borrowed from the pool must be reverted back to the pool _before_ freeing the pool */
     static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
     {
    -    int cid;
     -    for (cid=0; cid<pool->totalCCtx; cid++)
    -        ZSTD_freeCCtx(pool->cctx[cid]);  /* note : compatible with free on NULL */
    +    if (!pool) return;
         ZSTD_pthread_mutex_destroy(&pool->poolMutex);
    +    if (pool->cctxs) {
    +        int cid;
     +        for (cid=0; cid<pool->totalCCtx; cid++)
    +            ZSTD_freeCCtx(pool->cctxs[cid]);  /* free compatible with NULL */
    +        ZSTD_customFree(pool->cctxs, pool->cMem);
    +    }
         ZSTD_customFree(pool, pool->cMem);
     }
     
    @@ -368,19 +380,24 @@ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
     static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers,
                                                   ZSTD_customMem cMem)
     {
    -    ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) ZSTD_customCalloc(
    -        sizeof(ZSTDMT_CCtxPool) + (nbWorkers-1)*sizeof(ZSTD_CCtx*), cMem);
    +    ZSTDMT_CCtxPool* const cctxPool =
    +        (ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem);
         assert(nbWorkers > 0);
         if (!cctxPool) return NULL;
         if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) {
             ZSTD_customFree(cctxPool, cMem);
             return NULL;
         }
    -    cctxPool->cMem = cMem;
         cctxPool->totalCCtx = nbWorkers;
    +    cctxPool->cctxs = (ZSTD_CCtx**)ZSTD_customCalloc(nbWorkers * sizeof(ZSTD_CCtx*), cMem);
    +    if (!cctxPool->cctxs) {
    +        ZSTDMT_freeCCtxPool(cctxPool);
    +        return NULL;
    +    }
    +    cctxPool->cMem = cMem;
    +    cctxPool->cctxs[0] = ZSTD_createCCtx_advanced(cMem);
    +    if (!cctxPool->cctxs[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; }
         cctxPool->availCCtx = 1;   /* at least one cctx for single-thread mode */
    -    cctxPool->cctx[0] = ZSTD_createCCtx_advanced(cMem);
    -    if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; }
         DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers);
         return cctxPool;
     }
    @@ -402,16 +419,16 @@ static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool)
     {
         ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
         {   unsigned const nbWorkers = cctxPool->totalCCtx;
    -        size_t const poolSize = sizeof(*cctxPool)
    -                                + (nbWorkers-1) * sizeof(ZSTD_CCtx*);
    -        unsigned u;
    +        size_t const poolSize = sizeof(*cctxPool);
    +        size_t const arraySize = cctxPool->totalCCtx * sizeof(ZSTD_CCtx*);
             size_t totalCCtxSize = 0;
    +        unsigned u;
              for (u=0; u<nbWorkers; u++) {
     -            totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctx[u]);
    +            totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctxs[u]);
             }
             ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
             assert(nbWorkers > 0);
    -        return poolSize + totalCCtxSize;
    +        return poolSize + arraySize + totalCCtxSize;
         }
     }
     
    @@ -421,7 +438,7 @@ static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool)
         ZSTD_pthread_mutex_lock(&cctxPool->poolMutex);
         if (cctxPool->availCCtx) {
             cctxPool->availCCtx--;
    -        {   ZSTD_CCtx* const cctx = cctxPool->cctx[cctxPool->availCCtx];
    +        {   ZSTD_CCtx* const cctx = cctxPool->cctxs[cctxPool->availCCtx];
                 ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex);
                 return cctx;
         }   }
    @@ -435,7 +452,7 @@ static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx)
         if (cctx==NULL) return;   /* compatibility with release on NULL */
         ZSTD_pthread_mutex_lock(&pool->poolMutex);
         if (pool->availCCtx < pool->totalCCtx)
    -        pool->cctx[pool->availCCtx++] = cctx;
    +        pool->cctxs[pool->availCCtx++] = cctx;
         else {
             /* pool overflow : should not happen, since totalCCtx==nbWorkers */
             DEBUGLOG(4, "CCtx pool overflow : free cctx");
    @@ -601,11 +618,8 @@ static void ZSTDMT_serialState_update(serialState_t* serialState,
         ZSTD_pthread_mutex_unlock(&serialState->mutex);
     
         if (seqStore.size > 0) {
    -        size_t const err = ZSTD_referenceExternalSequences(
    -            jobCCtx, seqStore.seq, seqStore.size);
    +        ZSTD_referenceExternalSequences(jobCCtx, seqStore.seq, seqStore.size);
             assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable);
    -        assert(!ZSTD_isError(err));
    -        (void)err;
         }
     }
     
    @@ -657,12 +671,13 @@ typedef struct {
         unsigned frameChecksumNeeded;        /* used only by mtctx */
     } ZSTDMT_jobDescription;
     
    -#define JOB_ERROR(e) {                          \
    -    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);   \
    -    job->cSize = e;                             \
    -    ZSTD_pthread_mutex_unlock(&job->job_mutex); \
    -    goto _endJob;                               \
    -}
    +#define JOB_ERROR(e)                                \
    +    do {                                            \
    +        ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);   \
    +        job->cSize = e;                             \
    +        ZSTD_pthread_mutex_unlock(&job->job_mutex); \
    +        goto _endJob;                               \
    +    } while (0)
     
     /* ZSTDMT_compressionJob() is a POOL_function type */
     static void ZSTDMT_compressionJob(void* jobDescription)
    @@ -1091,7 +1106,7 @@ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
         {   unsigned jobNb;
             unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1);
             DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
    -                    mtctx->doneJobID, lastJobNb, mtctx->jobReady)
    +                    mtctx->doneJobID, lastJobNb, mtctx->jobReady);
             for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) {
                 unsigned const wJobID = jobNb & mtctx->jobIDMask;
                 ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID];
    diff --git a/third-party/zstd/lib/decompress/huf_decompress.c b/third-party/zstd/lib/decompress/huf_decompress.c
    index 5b217ac5..f85dd0be 100644
    --- a/third-party/zstd/lib/decompress/huf_decompress.c
    +++ b/third-party/zstd/lib/decompress/huf_decompress.c
    @@ -34,6 +34,12 @@
     *  Macros
     ****************************************************************/
     
    +#ifdef HUF_DISABLE_FAST_DECODE
    +# define HUF_ENABLE_FAST_DECODE 0
    +#else
    +# define HUF_ENABLE_FAST_DECODE 1
    +#endif
    +
     /* These two optional macros force the use one way or another of the two
      * Huffman decompression implementations. You can't force in both directions
      * at the same time.
    @@ -158,17 +164,18 @@ static size_t HUF_initFastDStream(BYTE const* ip) {
      * op [in/out] - The output pointers, must be updated to reflect what is written.
      * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
      * dt [in] - The decoding table.
    - * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
    + * ilowest [in] - The beginning of the valid range of the input. Decoders may read
    + *                down to this pointer. It may be below iend[0].
      * oend [in] - The end of the output stream. op[3] must not cross oend.
      * iend [in] - The end of each input stream. ip[i] may cross iend[i],
    - *             as long as it is above ilimit, but that indicates corruption.
    + *             as long as it is above ilowest, but that indicates corruption.
      */
     typedef struct {
         BYTE const* ip[4];
         BYTE* op[4];
         U64 bits[4];
         void const* dt;
    -    BYTE const* ilimit;
    +    BYTE const* ilowest;
         BYTE* oend;
         BYTE const* iend[4];
     } HUF_DecompressFastArgs;
    @@ -186,9 +193,9 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
         void const* dt = DTable + 1;
         U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
     
    -    const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
    +    const BYTE* const istart = (const BYTE*)src;
     
    -    BYTE* const oend = (BYTE*)dst + dstSize;
    +    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
     
         /* The fast decoding loop assumes 64-bit little-endian.
          * This condition is false on x32.
    @@ -196,6 +203,11 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
         if (!MEM_isLittleEndian() || MEM_32bits())
             return 0;
     
    +    /* Avoid nullptr addition */
    +    if (dstSize == 0)
    +        return 0;
    +    assert(dst != NULL);
    +
         /* strict minimum : jump table + 1 byte per stream */
         if (srcSize < 10)
             return ERROR(corruption_detected);
    @@ -209,7 +221,6 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
     
         /* Read the jump table. */
         {
    -        const BYTE* const istart = (const BYTE*)src;
             size_t const length1 = MEM_readLE16(istart);
             size_t const length2 = MEM_readLE16(istart+2);
             size_t const length3 = MEM_readLE16(istart+4);
    @@ -221,10 +232,8 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
     
             /* HUF_initFastDStream() requires this, and this small of an input
              * won't benefit from the ASM loop anyways.
    -         * length1 must be >= 16 so that ip[0] >= ilimit before the loop
    -         * starts.
              */
    -        if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
    +        if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
                 return 0;
             if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
         }
    @@ -256,11 +265,12 @@ static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* ds
         args->bits[2] = HUF_initFastDStream(args->ip[2]);
         args->bits[3] = HUF_initFastDStream(args->ip[3]);
     
    -    /* If ip[] >= ilimit, it is guaranteed to be safe to
    -        * reload bits[]. It may be beyond its section, but is
    -        * guaranteed to be valid (>= istart).
    -        */
    -    args->ilimit = ilimit;
    +    /* The decoders must be sure to never read beyond ilowest.
    +     * This is lower than iend[0], but allowing decoders to read
    +     * down to ilowest can allow an extra iteration or two in the
    +     * fast loop.
    +     */
    +    args->ilowest = istart;
     
         args->oend = oend;
         args->dt = dt;
    @@ -285,13 +295,31 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArg
         assert(sizeof(size_t) == 8);
         bit->bitContainer = MEM_readLEST(args->ip[stream]);
         bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
    -    bit->start = (const char*)args->iend[0];
    +    bit->start = (const char*)args->ilowest;
         bit->limitPtr = bit->start + sizeof(size_t);
         bit->ptr = (const char*)args->ip[stream];
     
         return 0;
     }
     
    +/* Calls X(N) for each stream 0, 1, 2, 3. */
    +#define HUF_4X_FOR_EACH_STREAM(X) \
    +    do {                          \
    +        X(0);                     \
    +        X(1);                     \
    +        X(2);                     \
    +        X(3);                     \
    +    } while (0)
    +
    +/* Calls X(N, var) for each stream 0, 1, 2, 3. */
    +#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
    +    do {                                        \
    +        X(0, (var));                            \
    +        X(1, (var));                            \
    +        X(2, (var));                            \
    +        X(3, (var));                            \
    +    } while (0)
    +
     
     #ifndef HUF_FORCE_DECOMPRESS_X2
     
    @@ -500,15 +528,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
     }
     
     #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
    -    *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
    +    do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
     
    -#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)  \
    -    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
    -        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
    +#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)      \
    +    do {                                            \
    +        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
    +            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
    +    } while (0)
     
    -#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
    -    if (MEM_64bits()) \
    -        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
    +#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr)      \
    +    do {                                            \
    +        if (MEM_64bits())                           \
    +            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
    +    } while (0)
     
     HINT_INLINE size_t
     HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
    @@ -546,7 +578,7 @@ HUF_decompress1X1_usingDTable_internal_body(
         const HUF_DTable* DTable)
     {
         BYTE* op = (BYTE*)dst;
    -    BYTE* const oend = op + dstSize;
    +    BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
         const void* dtPtr = DTable + 1;
         const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
         BIT_DStream_t bitD;
    @@ -574,6 +606,7 @@ HUF_decompress4X1_usingDTable_internal_body(
     {
         /* Check */
         if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
    +    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
     
         {   const BYTE* const istart = (const BYTE*) cSrc;
             BYTE* const ostart = (BYTE*) dst;
    @@ -609,7 +642,7 @@ HUF_decompress4X1_usingDTable_internal_body(
     
             if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
             if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
    -        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
    +        assert(dstSize >= 6); /* validated above */
             CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
             CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
             CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
    @@ -692,7 +725,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         BYTE* op[4];
         U16 const* const dtable = (U16 const*)args->dt;
         BYTE* const oend = args->oend;
    -    BYTE const* const ilimit = args->ilimit;
    +    BYTE const* const ilowest = args->ilowest;
     
         /* Copy the arguments to local variables */
         ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
    @@ -705,13 +738,12 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         for (;;) {
             BYTE* olimit;
             int stream;
    -        int symbol;
     
             /* Assert loop preconditions */
     #ifndef NDEBUG
             for (stream = 0; stream < 4; ++stream) {
                 assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
    -            assert(ip[stream] >= ilimit);
    +            assert(ip[stream] >= ilowest);
             }
     #endif
             /* Compute olimit */
    @@ -721,7 +753,7 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
                 /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
                  * per stream.
                  */
    -            size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
    +            size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
                 /* We can safely run iters iterations before running bounds checks */
                 size_t const iters = MIN(oiters, iiters);
                 size_t const symbols = iters * 5;
    @@ -732,8 +764,8 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
                  */
                 olimit = op[3] + symbols;
     
    -            /* Exit fast decoding loop once we get close to the end. */
    -            if (op[3] + 20 > olimit)
    +            /* Exit fast decoding loop once we reach the end. */
    +            if (op[3] == olimit)
                     break;
     
                 /* Exit the decoding loop if any input pointer has crossed the
    @@ -752,27 +784,42 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
             }
     #endif
     
    +#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)                 \
    +    do {                                                        \
    +        int const index = (int)(bits[(_stream)] >> 53);         \
    +        int const entry = (int)dtable[index];                   \
    +        bits[(_stream)] <<= (entry & 0x3F);                     \
    +        op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
    +    } while (0)
    +
    +#define HUF_4X1_RELOAD_STREAM(_stream)                              \
    +    do {                                                            \
    +        int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
    +        int const nbBits = ctz & 7;                                 \
    +        int const nbBytes = ctz >> 3;                               \
    +        op[(_stream)] += 5;                                         \
    +        ip[(_stream)] -= nbBytes;                                   \
    +        bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
    +        bits[(_stream)] <<= nbBits;                                 \
    +    } while (0)
    +
    +        /* Manually unroll the loop because compilers don't consistently
    +         * unroll the inner loops, which destroys performance.
    +         */
             do {
                 /* Decode 5 symbols in each of the 4 streams */
    -            for (symbol = 0; symbol < 5; ++symbol) {
    -                for (stream = 0; stream < 4; ++stream) {
    -                    int const index = (int)(bits[stream] >> 53);
    -                    int const entry = (int)dtable[index];
    -                    bits[stream] <<= (entry & 63);
    -                    op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
    -                }
    -            }
    -            /* Reload the bitstreams */
    -            for (stream = 0; stream < 4; ++stream) {
    -                int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
    -                int const nbBits = ctz & 7;
    -                int const nbBytes = ctz >> 3;
    -                op[stream] += 5;
    -                ip[stream] -= nbBytes;
    -                bits[stream] = MEM_read64(ip[stream]) | 1;
    -                bits[stream] <<= nbBits;
    -            }
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
    +
     +            /* Reload each of the 4 bitstreams */
    +            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
             } while (op[3] < olimit);
    +
    +#undef HUF_4X1_DECODE_SYMBOL
    +#undef HUF_4X1_RELOAD_STREAM
         }
     
     _out:
    @@ -797,8 +844,8 @@ HUF_decompress4X1_usingDTable_internal_fast(
         HUF_DecompressFastLoopFn loopFn)
     {
         void const* dt = DTable + 1;
    -    const BYTE* const iend = (const BYTE*)cSrc + 6;
    -    BYTE* const oend = (BYTE*)dst + dstSize;
    +    BYTE const* const ilowest = (BYTE const*)cSrc;
    +    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
         HUF_DecompressFastArgs args;
         {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
             FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
    @@ -806,18 +853,22 @@ HUF_decompress4X1_usingDTable_internal_fast(
                 return 0;
         }
     
    -    assert(args.ip[0] >= args.ilimit);
    +    assert(args.ip[0] >= args.ilowest);
         loopFn(&args);
     
    -    /* Our loop guarantees that ip[] >= ilimit and that we haven't
    +    /* Our loop guarantees that ip[] >= ilowest and that we haven't
         * overwritten any op[].
         */
    -    assert(args.ip[0] >= iend);
    -    assert(args.ip[1] >= iend);
    -    assert(args.ip[2] >= iend);
    -    assert(args.ip[3] >= iend);
    +    assert(args.ip[0] >= ilowest);
    +    assert(args.ip[0] >= ilowest);
    +    assert(args.ip[1] >= ilowest);
    +    assert(args.ip[2] >= ilowest);
    +    assert(args.ip[3] >= ilowest);
         assert(args.op[3] <= oend);
    -    (void)iend;
    +
    +    assert(ilowest == args.ilowest);
    +    assert(ilowest + 6 == args.iend[0]);
    +    (void)ilowest;
     
         /* finish bit streams one by one. */
         {   size_t const segmentSize = (dstSize+3) / 4;
    @@ -868,7 +919,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize,
         }
     #endif
     
    -    if (!(flags & HUF_flags_disableFast)) {
    +    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
             size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
             if (ret != 0)
                 return ret;
    @@ -1239,15 +1290,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
     }
     
     #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
    -    ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
    +    do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
     
    -#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
    -    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
    -        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
    +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)                     \
    +    do {                                                           \
    +        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12))                \
    +            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
    +    } while (0)
     
    -#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
    -    if (MEM_64bits()) \
    -        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
    +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr)                     \
    +    do {                                                           \
    +        if (MEM_64bits())                                          \
    +            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
    +    } while (0)
     
     HINT_INLINE size_t
     HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
    @@ -1307,7 +1362,7 @@ HUF_decompress1X2_usingDTable_internal_body(
     
         /* decode */
         {   BYTE* const ostart = (BYTE*) dst;
    -        BYTE* const oend = ostart + dstSize;
    +        BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
             const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
             const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
             DTableDesc const dtd = HUF_getDTableDesc(DTable);
    @@ -1332,6 +1387,7 @@ HUF_decompress4X2_usingDTable_internal_body(
         const HUF_DTable* DTable)
     {
         if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
    +    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
     
         {   const BYTE* const istart = (const BYTE*) cSrc;
             BYTE* const ostart = (BYTE*) dst;
    @@ -1367,7 +1423,7 @@ HUF_decompress4X2_usingDTable_internal_body(
     
             if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
             if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
    -        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
    +        assert(dstSize >= 6 /* validated above */);
             CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
             CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
             CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
    @@ -1472,7 +1528,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         BYTE* op[4];
         BYTE* oend[4];
         HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
    -    BYTE const* const ilimit = args->ilimit;
    +    BYTE const* const ilowest = args->ilowest;
     
         /* Copy the arguments to local registers. */
         ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
    @@ -1490,13 +1546,12 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
         for (;;) {
             BYTE* olimit;
             int stream;
    -        int symbol;
     
             /* Assert loop preconditions */
     #ifndef NDEBUG
             for (stream = 0; stream < 4; ++stream) {
                 assert(op[stream] <= oend[stream]);
    -            assert(ip[stream] >= ilimit);
    +            assert(ip[stream] >= ilowest);
             }
     #endif
             /* Compute olimit */
    @@ -1509,7 +1564,7 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
                  * We also know that each input pointer is >= ip[0]. So we can run
                  * iters loops before running out of input.
                  */
    -            size_t iters = (size_t)(ip[0] - ilimit) / 7;
    +            size_t iters = (size_t)(ip[0] - ilowest) / 7;
                 /* Each iteration can produce up to 10 bytes of output per stream.
                  * Each output stream my advance at different rates. So take the
                  * minimum number of safe iterations among all the output streams.
    @@ -1527,8 +1582,8 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
                  */
                 olimit = op[3] + (iters * 5);
     
    -            /* Exit the fast decoding loop if we are too close to the end. */
    -            if (op[3] + 10 > olimit)
    +            /* Exit the fast decoding loop once we reach the end. */
    +            if (op[3] == olimit)
                     break;
     
                 /* Exit the decoding loop if any input pointer has crossed the
    @@ -1547,54 +1602,58 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
             }
     #endif
     
    +#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)                      \
    +    do {                                                              \
    +        if ((_decode3) || (_stream) != 3) {                           \
    +            int const index = (int)(bits[(_stream)] >> 53);           \
    +            HUF_DEltX2 const entry = dtable[index];                   \
     +            MEM_write16(op[(_stream)], entry.sequence);               \
    +            bits[(_stream)] <<= (entry.nbBits) & 0x3F;                \
    +            op[(_stream)] += (entry.length);                          \
    +        }                                                             \
    +    } while (0)
    +
    +#define HUF_4X2_RELOAD_STREAM(_stream)                                  \
    +    do {                                                                \
    +        HUF_4X2_DECODE_SYMBOL(3, 1);                                    \
    +        {                                                               \
    +            int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
    +            int const nbBits = ctz & 7;                                 \
    +            int const nbBytes = ctz >> 3;                               \
    +            ip[(_stream)] -= nbBytes;                                   \
    +            bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
    +            bits[(_stream)] <<= nbBits;                                 \
    +        }                                                               \
    +    } while (0)
    +
    +        /* Manually unroll the loop because compilers don't consistently
    +         * unroll the inner loops, which destroys performance.
    +         */
             do {
    -            /* Do 5 table lookups for each of the first 3 streams */
    -            for (symbol = 0; symbol < 5; ++symbol) {
    -                for (stream = 0; stream < 3; ++stream) {
    -                    int const index = (int)(bits[stream] >> 53);
    -                    HUF_DEltX2 const entry = dtable[index];
    -                    MEM_write16(op[stream], entry.sequence);
    -                    bits[stream] <<= (entry.nbBits);
    -                    op[stream] += (entry.length);
    -                }
    -            }
    -            /* Do 1 table lookup from the final stream */
    -            {
    -                int const index = (int)(bits[3] >> 53);
    -                HUF_DEltX2 const entry = dtable[index];
    -                MEM_write16(op[3], entry.sequence);
    -                bits[3] <<= (entry.nbBits);
    -                op[3] += (entry.length);
    -            }
    -            /* Do 4 table lookups from the final stream & reload bitstreams */
    -            for (stream = 0; stream < 4; ++stream) {
    -                /* Do a table lookup from the final stream.
    -                 * This is interleaved with the reloading to reduce register
    -                 * pressure. This shouldn't be necessary, but compilers can
    -                 * struggle with codegen with high register pressure.
    -                 */
    -                {
    -                    int const index = (int)(bits[3] >> 53);
    -                    HUF_DEltX2 const entry = dtable[index];
    -                    MEM_write16(op[3], entry.sequence);
    -                    bits[3] <<= (entry.nbBits);
    -                    op[3] += (entry.length);
    -                }
    -                /* Reload the bistreams. The final bitstream must be reloaded
    -                 * after the 5th symbol was decoded.
    -                 */
    -                {
    -                    int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
    -                    int const nbBits = ctz & 7;
    -                    int const nbBytes = ctz >> 3;
    -                    ip[stream] -= nbBytes;
    -                    bits[stream] = MEM_read64(ip[stream]) | 1;
    -                    bits[stream] <<= nbBits;
    -                }
    -            }
    +            /* Decode 5 symbols from each of the first 3 streams.
    +             * The final stream will be decoded during the reload phase
    +             * to reduce register pressure.
    +             */
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
    +            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
    +
    +            /* Decode one symbol from the final stream */
    +            HUF_4X2_DECODE_SYMBOL(3, 1);
    +
    +            /* Decode 4 symbols from the final stream & reload bitstreams.
    +             * The final stream is reloaded last, meaning that all 5 symbols
    +             * are decoded from the final stream before it is reloaded.
    +             */
    +            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
             } while (op[3] < olimit);
         }
     
    +#undef HUF_4X2_DECODE_SYMBOL
    +#undef HUF_4X2_RELOAD_STREAM
    +
     _out:
     
         /* Save the final values of each of the state variables back to args. */
    @@ -1611,8 +1670,8 @@ HUF_decompress4X2_usingDTable_internal_fast(
         const HUF_DTable* DTable,
         HUF_DecompressFastLoopFn loopFn) {
         void const* dt = DTable + 1;
    -    const BYTE* const iend = (const BYTE*)cSrc + 6;
    -    BYTE* const oend = (BYTE*)dst + dstSize;
    +    const BYTE* const ilowest = (const BYTE*)cSrc;
    +    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
         HUF_DecompressFastArgs args;
         {
             size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
    @@ -1621,16 +1680,19 @@ HUF_decompress4X2_usingDTable_internal_fast(
                 return 0;
         }
     
    -    assert(args.ip[0] >= args.ilimit);
    +    assert(args.ip[0] >= args.ilowest);
         loopFn(&args);
     
         /* note : op4 already verified within main loop */
    -    assert(args.ip[0] >= iend);
    -    assert(args.ip[1] >= iend);
    -    assert(args.ip[2] >= iend);
    -    assert(args.ip[3] >= iend);
    +    assert(args.ip[0] >= ilowest);
    +    assert(args.ip[1] >= ilowest);
    +    assert(args.ip[2] >= ilowest);
    +    assert(args.ip[3] >= ilowest);
         assert(args.op[3] <= oend);
    -    (void)iend;
    +
    +    assert(ilowest == args.ilowest);
    +    assert(ilowest + 6 == args.iend[0]);
    +    (void)ilowest;
     
         /* finish bitStreams one by one */
         {
    @@ -1679,7 +1741,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize,
         }
     #endif
     
    -    if (!(flags & HUF_flags_disableFast)) {
    +    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
             size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
             if (ret != 0)
                 return ret;
    diff --git a/third-party/zstd/lib/decompress/huf_decompress_amd64.S b/third-party/zstd/lib/decompress/huf_decompress_amd64.S
    index 671624fe..78da291e 100644
    --- a/third-party/zstd/lib/decompress/huf_decompress_amd64.S
    +++ b/third-party/zstd/lib/decompress/huf_decompress_amd64.S
    @@ -10,11 +10,32 @@
     
     #include "../common/portability_macros.h"
     
    +#if defined(__ELF__) && defined(__GNUC__)
     /* Stack marking
      * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
      */
    -#if defined(__ELF__) && defined(__GNUC__)
     .section .note.GNU-stack,"",%progbits
    +
    +#if defined(__aarch64__)
    +/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
    + * See: https://github.com/facebook/zstd/issues/3841
    + * See: https://gcc.godbolt.org/z/sqr5T4ffK
    + * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
    + * See: https://reviews.llvm.org/D62609
    + */
    +.pushsection .note.gnu.property, "a"
    +.p2align 3
    +.long 4                 /* size of the name - "GNU\0" */
    +.long 0x10              /* size of descriptor */
    +.long 0x5               /* NT_GNU_PROPERTY_TYPE_0 */
    +.asciz "GNU"
    +.long 0xc0000000        /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
    +.long 4                 /* pr_datasz - 4 bytes */
    +.long 3                 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
    +.p2align 3              /* pr_padding - bring everything to 8 byte alignment */
    +.popsection
    +#endif
    +
     #endif
     
     #if ZSTD_ENABLE_ASM_X86_64_BMI2
    @@ -131,7 +152,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
         movq 88(%rax), %bits3
         movq 96(%rax), %dtable
         push %rax      /* argument */
    -    push 104(%rax) /* ilimit */
    +    push 104(%rax) /* ilowest */
         push 112(%rax) /* oend */
         push %olimit   /* olimit space */
     
    @@ -156,11 +177,11 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
         shrq $2, %r15
     
         movq %ip0,     %rax /* rax = ip0 */
    -    movq 40(%rsp), %rdx /* rdx = ilimit */
    -    subq %rdx,     %rax /* rax = ip0 - ilimit */
    -    movq %rax,     %rbx /* rbx = ip0 - ilimit */
    +    movq 40(%rsp), %rdx /* rdx = ilowest */
    +    subq %rdx,     %rax /* rax = ip0 - ilowest */
    +    movq %rax,     %rbx /* rbx = ip0 - ilowest */
     
    -    /* rdx = (ip0 - ilimit) / 7 */
    +    /* rdx = (ip0 - ilowest) / 7 */
         movabsq $2635249153387078803, %rdx
         mulq %rdx
         subq %rdx, %rbx
    @@ -183,9 +204,8 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
     
         /* If (op3 + 20 > olimit) */
         movq %op3, %rax    /* rax = op3 */
    -    addq $20,  %rax    /* rax = op3 + 20 */
    -    cmpq %rax, %olimit /* op3 + 20 > olimit */
    -    jb .L_4X1_exit
    +    cmpq %rax, %olimit /* op3 == olimit */
    +    je .L_4X1_exit
     
         /* If (ip1 < ip0) go to exit */
         cmpq %ip0, %ip1
    @@ -316,7 +336,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
         /* Restore stack (oend & olimit) */
         pop %rax /* olimit */
         pop %rax /* oend */
    -    pop %rax /* ilimit */
    +    pop %rax /* ilowest */
         pop %rax /* arg */
     
         /* Save ip / op / bits */
    @@ -387,7 +407,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
         movq 96(%rax), %dtable
         push %rax      /* argument */
         push %rax      /* olimit */
    -    push 104(%rax) /* ilimit */
    +    push 104(%rax) /* ilowest */
     
         movq 112(%rax), %rax
         push %rax /* oend3 */
    @@ -414,9 +434,9 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
     
         /* We can consume up to 7 input bytes each iteration. */
         movq %ip0,     %rax  /* rax = ip0 */
    -    movq 40(%rsp), %rdx  /* rdx = ilimit */
    -    subq %rdx,     %rax  /* rax = ip0 - ilimit */
    -    movq %rax,    %r15   /* r15 = ip0 - ilimit */
    +    movq 40(%rsp), %rdx  /* rdx = ilowest */
    +    subq %rdx,     %rax  /* rax = ip0 - ilowest */
    +    movq %rax,    %r15   /* r15 = ip0 - ilowest */
     
         /* rdx = rax / 7 */
         movabsq $2635249153387078803, %rdx
    @@ -426,7 +446,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
         addq %r15, %rdx
         shrq $2, %rdx
     
    -    /* r15 = (ip0 - ilimit) / 7 */
    +    /* r15 = (ip0 - ilowest) / 7 */
         movq %rdx, %r15
     
         /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
    @@ -467,9 +487,8 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
     
         /* If (op3 + 10 > olimit) */
         movq %op3, %rax    /* rax = op3 */
    -    addq $10,  %rax    /* rax = op3 + 10 */
    -    cmpq %rax, %olimit /* op3 + 10 > olimit */
    -    jb .L_4X2_exit
    +    cmpq %rax, %olimit /* op3 == olimit */
    +    je .L_4X2_exit
     
         /* If (ip1 < ip0) go to exit */
         cmpq %ip0, %ip1
    @@ -537,7 +556,7 @@ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
         pop %rax /* oend1 */
         pop %rax /* oend2 */
         pop %rax /* oend3 */
    -    pop %rax /* ilimit */
    +    pop %rax /* ilowest */
         pop %rax /* olimit */
         pop %rax /* arg */
     
    diff --git a/third-party/zstd/lib/decompress/zstd_decompress.c b/third-party/zstd/lib/decompress/zstd_decompress.c
    index 7bc27134..2f03cf7b 100644
    --- a/third-party/zstd/lib/decompress/zstd_decompress.c
    +++ b/third-party/zstd/lib/decompress/zstd_decompress.c
    @@ -55,18 +55,19 @@
     /*-*******************************************************
     *  Dependencies
     *********************************************************/
    -#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
     #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
    +#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
    +#include "../common/error_private.h"
    +#include "../common/zstd_internal.h"  /* blockProperties_t */
     #include "../common/mem.h"         /* low level memory routines */
    +#include "../common/bits.h"  /* ZSTD_highbit32 */
     #define FSE_STATIC_LINKING_ONLY
     #include "../common/fse.h"
     #include "../common/huf.h"
     #include "../common/xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */
    -#include "../common/zstd_internal.h"  /* blockProperties_t */
     #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
     #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
     #include "zstd_decompress_block.h"   /* ZSTD_decompressBlock_internal */
    -#include "../common/bits.h"  /* ZSTD_highbit32 */
     
     #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
     #  include "../legacy/zstd_legacy.h"
    @@ -245,6 +246,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
         dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
         dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
         dctx->disableHufAsm = 0;
    +    dctx->maxBlockSizeParam = 0;
     }
     
     static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
    @@ -265,6 +267,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
     #endif
         dctx->noForwardProgress = 0;
         dctx->oversizedDuration = 0;
    +    dctx->isFrameDecompression = 1;
     #if DYNAMIC_BMI2
         dctx->bmi2 = ZSTD_cpuSupportsBmi2();
     #endif
    @@ -726,17 +729,17 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret)
         return frameSizeInfo;
     }
     
    -static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize)
    +static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format)
     {
         ZSTD_frameSizeInfo frameSizeInfo;
         ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo));
     
     #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
    -    if (ZSTD_isLegacy(src, srcSize))
    +    if (format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize))
             return ZSTD_findFrameSizeInfoLegacy(src, srcSize);
     #endif
     
    -    if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
    +    if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
             && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
             frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize);
             assert(ZSTD_isError(frameSizeInfo.compressedSize) ||
    @@ -750,7 +753,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
             ZSTD_frameHeader zfh;
     
             /* Extract Frame Header */
    -        {   size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
    +        {   size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format);
                 if (ZSTD_isError(ret))
                     return ZSTD_errorFrameSizeInfo(ret);
                 if (ret > 0)
    @@ -793,15 +796,17 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
         }
     }
     
    +static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) {
    +    ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format);
    +    return frameSizeInfo.compressedSize;
    +}
    +
     /** ZSTD_findFrameCompressedSize() :
    - *  compatible with legacy mode
    - *  `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
    - *  `srcSize` must be at least as large as the frame contained
    - *  @return : the compressed size of the frame starting at `src` */
    + * See docs in zstd.h
    + * Note: compatible with legacy mode */
     size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
     {
    -    ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
    -    return frameSizeInfo.compressedSize;
    +    return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1);
     }
     
     /** ZSTD_decompressBound() :
    @@ -815,7 +820,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
         unsigned long long bound = 0;
         /* Iterate over each frame */
         while (srcSize > 0) {
    -        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
    +        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
             size_t const compressedSize = frameSizeInfo.compressedSize;
             unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
             if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
    @@ -835,7 +840,7 @@ size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
     
         /* Iterate over each frame */
         while (srcSize > 0) {
    -        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
    +        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
             size_t const compressedSize = frameSizeInfo.compressedSize;
             unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
             ZSTD_frameHeader zfh;
    @@ -971,6 +976,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
             ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize;
         }
     
    +    /* Shrink the blockSizeMax if enabled */
    +    if (dctx->maxBlockSizeParam != 0)
    +        dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam);
    +
         /* Loop on each block */
         while (1) {
             BYTE* oBlockEnd = oend;
    @@ -1003,7 +1012,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
             switch(blockProperties.blockType)
             {
             case bt_compressed:
    -            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming);
    +            assert(dctx->isFrameDecompression == 1);
    +            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming);
                 break;
             case bt_raw :
                 /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */
    @@ -1016,12 +1026,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
             default:
                 RETURN_ERROR(corruption_detected, "invalid block type");
             }
    -
    -        if (ZSTD_isError(decodedSize)) return decodedSize;
    -        if (dctx->validateChecksum)
    +        FORWARD_IF_ERROR(decodedSize, "Block decompression failure");
    +        DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize);
    +        if (dctx->validateChecksum) {
                 XXH64_update(&dctx->xxhState, op, decodedSize);
    -        if (decodedSize != 0)
    +        }
    +        if (decodedSize) /* support dst = NULL,0 */ {
                 op += decodedSize;
    +        }
             assert(ip != NULL);
             ip += cBlockSize;
             remainingSrcSize -= cBlockSize;
    @@ -1051,7 +1063,9 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
         return (size_t)(op-ostart);
     }
     
    -static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
    +static
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
    +size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                                             void* dst, size_t dstCapacity,
                                       const void* src, size_t srcSize,
                                       const void* dict, size_t dictSize,
    @@ -1071,7 +1085,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
         while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
     
     #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
    -        if (ZSTD_isLegacy(src, srcSize)) {
    +        if (dctx->format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) {
                 size_t decodedSize;
                 size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
                 if (ZSTD_isError(frameSize)) return frameSize;
    @@ -1081,6 +1095,15 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                 decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
                 if (ZSTD_isError(decodedSize)) return decodedSize;
     
    +            {
    +                unsigned long long const expectedSize = ZSTD_getFrameContentSize(src, srcSize);
    +                RETURN_ERROR_IF(expectedSize == ZSTD_CONTENTSIZE_ERROR, corruption_detected, "Corrupted frame header!");
    +                if (expectedSize != ZSTD_CONTENTSIZE_UNKNOWN) {
    +                    RETURN_ERROR_IF(expectedSize != decodedSize, corruption_detected,
    +                        "Frame header size does not match decoded size!");
    +                }
    +            }
    +
                 assert(decodedSize <= dstCapacity);
                 dst = (BYTE*)dst + decodedSize;
                 dstCapacity -= decodedSize;
    @@ -1092,7 +1115,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
             }
     #endif
     
    -        if (srcSize >= 4) {
    +        if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) {
                 U32 const magicNumber = MEM_readLE32(src);
                 DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
                 if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
    @@ -1319,7 +1342,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
                 {
                 case bt_compressed:
                     DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
    -                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming);
    +                assert(dctx->isFrameDecompression == 1);
    +                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming);
                     dctx->expected = 0;  /* Streaming not supported */
                     break;
                 case bt_raw :
    @@ -1388,6 +1412,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
         case ZSTDds_decodeSkippableHeader:
             assert(src != NULL);
             assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE);
    +        assert(dctx->format != ZSTD_f_zstd1_magicless);
             ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize);   /* complete skippable header */
             dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
             dctx->stage = ZSTDds_skipFrame;
    @@ -1548,6 +1573,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
         dctx->litEntropy = dctx->fseEntropy = 0;
         dctx->dictID = 0;
         dctx->bType = bt_reserved;
    +    dctx->isFrameDecompression = 1;
         ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
         ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue));  /* initial repcodes */
         dctx->LLTptr = dctx->entropy.LLTable;
    @@ -1819,6 +1845,10 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
                 bounds.lowerBound = 0;
                 bounds.upperBound = 1;
                 return bounds;
    +        case ZSTD_d_maxBlockSize:
    +            bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
    +            bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
    +            return bounds;
     
             default:;
         }
    @@ -1863,6 +1893,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
             case ZSTD_d_disableHuffmanAssembly:
                 *value = (int)dctx->disableHufAsm;
                 return 0;
    +        case ZSTD_d_maxBlockSize:
    +            *value = dctx->maxBlockSizeParam;
    +            return 0;
             default:;
         }
         RETURN_ERROR(parameter_unsupported, "");
    @@ -1900,6 +1933,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
                 CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
                 dctx->disableHufAsm = value != 0;
                 return 0;
    +        case ZSTD_d_maxBlockSize:
    +            if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value);
    +            dctx->maxBlockSizeParam = value;
    +            return 0;
             default:;
         }
         RETURN_ERROR(parameter_unsupported, "");
    @@ -1911,6 +1948,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset)
           || (reset == ZSTD_reset_session_and_parameters) ) {
             dctx->streamStage = zdss_init;
             dctx->noForwardProgress = 0;
    +        dctx->isFrameDecompression = 1;
         }
         if ( (reset == ZSTD_reset_parameters)
           || (reset == ZSTD_reset_session_and_parameters) ) {
    @@ -1927,11 +1965,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
         return ZSTD_sizeof_DCtx(dctx);
     }
     
    -size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
    +static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax)
     {
    -    size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
    -    /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/
    -    unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2);
    +    size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax);
    +    /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block
    +     * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing
    +     * the block at the beginning of the output buffer, and maintain a full window.
    +     *
    +     * We need another blockSize worth of buffer so that we can store split
    +     * literals at the end of the block without overwriting the extDict window.
    +     */
    +    unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2);
         unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
         size_t const minRBSize = (size_t) neededSize;
         RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
    @@ -1939,6 +1983,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
         return minRBSize;
     }
     
    +size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
    +{
    +    return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX);
    +}
    +
     size_t ZSTD_estimateDStreamSize(size_t windowSize)
     {
         size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
    @@ -2134,12 +2183,12 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
                     && zds->fParams.frameType != ZSTD_skippableFrame
                     && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
    -                size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart));
    +                size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format);
                     if (cSize <= (size_t)(iend-istart)) {
                         /* shortcut : using single-pass mode */
                         size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
                         if (ZSTD_isError(decompressedSize)) return decompressedSize;
    -                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
    +                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()");
                         assert(istart != NULL);
                         ip = istart + cSize;
                         op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
    @@ -2161,7 +2210,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 DEBUGLOG(4, "Consume header");
                 FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), "");
     
    -            if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
    +            if (zds->format == ZSTD_f_zstd1
    +                && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
                     zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
                     zds->stage = ZSTDds_skipFrame;
                 } else {
    @@ -2177,11 +2227,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
                 RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize,
                                 frameParameter_windowTooLarge, "");
    +            if (zds->maxBlockSizeParam != 0)
    +                zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam);
     
                 /* Adapt buffer sizes to frame header instructions */
                 {   size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
                     size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered
    -                        ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize)
    +                        ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax)
                             : 0;
     
                     ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize);
    diff --git a/third-party/zstd/lib/decompress/zstd_decompress_block.c b/third-party/zstd/lib/decompress/zstd_decompress_block.c
    index 09896a93..76d7332e 100644
    --- a/third-party/zstd/lib/decompress/zstd_decompress_block.c
    +++ b/third-party/zstd/lib/decompress/zstd_decompress_block.c
    @@ -51,6 +51,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
      *   Block decoding
      ***************************************************************/
     
    +static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
    +{
    +    size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
    +    assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);
    +    return blockSizeMax;
    +}
    +
     /*! ZSTD_getcBlockSize() :
      *  Provides the size of compressed block from block header `src` */
     size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
    @@ -73,41 +80,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
     static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
         const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
     {
    -    if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
    -    {
    -        /* room for litbuffer to fit without read faulting */
    -        dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
    +    size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
    +    assert(litSize <= blockSizeMax);
    +    assert(dctx->isFrameDecompression || streaming == not_streaming);
    +    assert(expectedWriteSize <= blockSizeMax);
    +    if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
    +        /* If we aren't streaming, we can just put the literals after the output
    +         * of the current block. We don't need to worry about overwriting the
    +         * extDict of our window, because it doesn't exist.
    +         * So if we have space after the end of the block, just put it there.
    +         */
    +        dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
             dctx->litBufferEnd = dctx->litBuffer + litSize;
             dctx->litBufferLocation = ZSTD_in_dst;
    -    }
    -    else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
    -    {
    -        /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
    +    } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
    +        /* Literals fit entirely within the extra buffer, put them there to avoid
    +         * having to split the literals.
    +         */
    +        dctx->litBuffer = dctx->litExtraBuffer;
    +        dctx->litBufferEnd = dctx->litBuffer + litSize;
    +        dctx->litBufferLocation = ZSTD_not_in_dst;
    +    } else {
    +        assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
    +        /* Literals must be split between the output block and the extra lit
    +         * buffer. We fill the extra lit buffer with the tail of the literals,
    +         * and put the rest of the literals at the end of the block, with
    +         * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
    +         * This MUST not write more than our maxBlockSize beyond dst, because in
    +         * streaming mode, that could overwrite part of our extDict window.
    +         */
             if (splitImmediately) {
                 /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
                 dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
                 dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
    -        }
    -        else {
    +        } else {
                 /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
                 dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
                 dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
             }
             dctx->litBufferLocation = ZSTD_split;
    -    }
    -    else
    -    {
    -        /* fits entirely within litExtraBuffer, so no split is necessary */
    -        dctx->litBuffer = dctx->litExtraBuffer;
    -        dctx->litBufferEnd = dctx->litBuffer + litSize;
    -        dctx->litBufferLocation = ZSTD_not_in_dst;
    +        assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
         }
     }
     
    -/* Hidden declaration for fullbench */
    -size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
    -                          const void* src, size_t srcSize,
    -                          void* dst, size_t dstCapacity, const streaming_operation streaming);
     /*! ZSTD_decodeLiteralsBlock() :
      * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
      * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
    @@ -116,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
      *
      * @return : nb of bytes read from src (< srcSize )
      *  note : symbol not declared but exposed for fullbench */
    -size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
    +static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                               const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
                               void* dst, size_t dstCapacity, const streaming_operation streaming)
     {
    @@ -125,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
     
         {   const BYTE* const istart = (const BYTE*) src;
             symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
    +        size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
     
             switch(litEncType)
             {
    @@ -140,7 +156,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     U32 const lhlCode = (istart[0] >> 2) & 3;
                     U32 const lhc = MEM_readLE32(istart);
                     size_t hufSuccess;
    -                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
    +                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                     int const flags = 0
                         | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
                         | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
    @@ -167,7 +183,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                         break;
                     }
                     RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
    -                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
    +                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                     if (!singleStream)
                         RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
                             "Not enough literals (%zu) for the 4-streams mode (min %u)",
    @@ -214,10 +230,12 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     }
                     if (dctx->litBufferLocation == ZSTD_split)
                     {
    +                    assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
                         ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                         ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
                         dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
                         dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
    +                    assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
                     }
     
                     RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
    @@ -232,7 +250,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
             case set_basic:
                 {   size_t litSize, lhSize;
                     U32 const lhlCode = ((istart[0]) >> 2) & 3;
    -                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
    +                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                     switch(lhlCode)
                     {
                     case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
    @@ -251,6 +269,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     }
     
                     RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
    +                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                     RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                     ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                     if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
    @@ -279,7 +298,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
             case set_rle:
                 {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                     size_t litSize, lhSize;
    -                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
    +                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                     switch(lhlCode)
                     {
                     case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
    @@ -298,7 +317,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                         break;
                     }
                     RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
    -                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
    +                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                     RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                     ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                     if (dctx->litBufferLocation == ZSTD_split)
    @@ -320,6 +339,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
         }
     }
     
    +/* Hidden declaration for fullbench */
    +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
    +                          const void* src, size_t srcSize,
    +                          void* dst, size_t dstCapacity);
    +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
    +                          const void* src, size_t srcSize,
    +                          void* dst, size_t dstCapacity)
    +{
    +    dctx->isFrameDecompression = 0;
    +    return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
    +}
    +
     /* Default FSE distribution tables.
      * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
      * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
    @@ -675,11 +706,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
     
         /* SeqHead */
         nbSeq = *ip++;
    -    if (!nbSeq) {
    -        *nbSeqPtr=0;
    -        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
    -        return 1;
    -    }
         if (nbSeq > 0x7F) {
             if (nbSeq == 0xFF) {
                 RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
    @@ -692,8 +718,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
         }
         *nbSeqPtr = nbSeq;
     
    +    if (nbSeq == 0) {
    +        /* No sequence : section ends immediately */
    +        RETURN_ERROR_IF(ip != iend, corruption_detected,
    +            "extraneous data present in the Sequences section");
    +        return (size_t)(ip - istart);
    +    }
    +
         /* FSE table descriptors */
         RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
    +    RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
         {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
             symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
             symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
    @@ -840,7 +874,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt
     /* ZSTD_safecopyDstBeforeSrc():
      * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
      * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
    -static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
    +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
         ptrdiff_t const diff = op - ip;
         BYTE* const oend = op + length;
     
    @@ -869,6 +903,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length
      * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
      */
     FORCE_NOINLINE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_execSequenceEnd(BYTE* op,
         BYTE* const oend, seq_t sequence,
         const BYTE** litPtr, const BYTE* const litLimit,
    @@ -916,6 +951,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
      * This version is intended to be used during instances where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
      */
     FORCE_NOINLINE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
         BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
         const BYTE** litPtr, const BYTE* const litLimit,
    @@ -961,6 +997,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
     }
     
     HINT_INLINE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_execSequence(BYTE* op,
         BYTE* const oend, seq_t sequence,
         const BYTE** litPtr, const BYTE* const litLimit,
    @@ -1059,6 +1096,7 @@ size_t ZSTD_execSequence(BYTE* op,
     }
     
     HINT_INLINE
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
         BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
         const BYTE** litPtr, const BYTE* const litLimit,
    @@ -1181,14 +1219,20 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
     
     typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
     
    +/**
    + * ZSTD_decodeSequence():
    + * @p longOffsets : tells the decoder to reload more bit while decoding large offsets
    + *                  only used in 32-bit mode
    + * @return : Sequence (litL + matchL + offset)
    + */
     FORCE_INLINE_TEMPLATE seq_t
    -ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
    +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
     {
         seq_t seq;
         /*
    -     * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
    -     * loaded in one operation and extracted its fields by simply shifting or
    -     * bit-extracting on aarch64.
    +     * ZSTD_seqSymbol is a 64 bits wide structure.
    +     * It can be loaded in one operation
    +     * and its fields extracted by simply shifting or bit-extracting on aarch64.
          * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
          * operations that cause performance drop. This can be avoided by using this
          * ZSTD_memcpy hack.
    @@ -1261,7 +1305,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
                     } else {
                         offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
                         {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
    -                        temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
    +                        temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
                             if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                             seqState->prevOffset[1] = seqState->prevOffset[0];
                             seqState->prevOffset[0] = offset = temp;
    @@ -1288,17 +1332,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
             DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                         (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
     
    -        ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
    -        ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
    -        if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
    -        ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
    +        if (!isLastSeq) {
    +            /* don't update FSE state for last Sequence */
    +            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
    +            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
    +            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
    +            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
    +            BIT_reloadDStream(&seqState->DStream);
    +        }
         }
     
         return seq;
     }
     
    -#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    -MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
    +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
    +#if DEBUGLEVEL >= 1
    +static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
     {
         size_t const windowSize = dctx->fParams.windowSize;
         /* No dictionary used. */
    @@ -1312,30 +1361,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix
         /* Dictionary is active. */
         return 1;
     }
    +#endif
     
    -MEM_STATIC void ZSTD_assertValidSequence(
    +static void ZSTD_assertValidSequence(
             ZSTD_DCtx const* dctx,
             BYTE const* op, BYTE const* oend,
             seq_t const seq,
             BYTE const* prefixStart, BYTE const* virtualStart)
     {
     #if DEBUGLEVEL >= 1
    -    size_t const windowSize = dctx->fParams.windowSize;
    -    size_t const sequenceSize = seq.litLength + seq.matchLength;
    -    BYTE const* const oLitEnd = op + seq.litLength;
    -    DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
    -            (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
    -    assert(op <= oend);
    -    assert((size_t)(oend - op) >= sequenceSize);
    -    assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
    -    if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
    -        size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
    -        /* Offset must be within the dictionary. */
    -        assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
    -        assert(seq.offset <= windowSize + dictSize);
    -    } else {
    -        /* Offset must be within our window. */
    -        assert(seq.offset <= windowSize);
    +    if (dctx->isFrameDecompression) {
    +        size_t const windowSize = dctx->fParams.windowSize;
    +        size_t const sequenceSize = seq.litLength + seq.matchLength;
    +        BYTE const* const oLitEnd = op + seq.litLength;
    +        DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
    +                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
    +        assert(op <= oend);
    +        assert((size_t)(oend - op) >= sequenceSize);
    +        assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
    +        if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
    +            size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
    +            /* Offset must be within the dictionary. */
    +            assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
    +            assert(seq.offset <= windowSize + dictSize);
    +        } else {
    +            /* Offset must be within our window. */
    +            assert(seq.offset <= windowSize);
    +        }
         }
     #else
         (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
    @@ -1351,23 +1403,21 @@ DONT_VECTORIZE
     ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                                    void* dst, size_t maxDstSize,
                              const void* seqStart, size_t seqSize, int nbSeq,
    -                         const ZSTD_longOffset_e isLongOffset,
    -                         const int frame)
    +                         const ZSTD_longOffset_e isLongOffset)
     {
         const BYTE* ip = (const BYTE*)seqStart;
         const BYTE* const iend = ip + seqSize;
         BYTE* const ostart = (BYTE*)dst;
    -    BYTE* const oend = ostart + maxDstSize;
    +    BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
         BYTE* op = ostart;
         const BYTE* litPtr = dctx->litPtr;
         const BYTE* litBufferEnd = dctx->litBufferEnd;
         const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
         const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
         const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    -    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
    -    (void)frame;
    +    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);
     
    -    /* Regen sequences */
    +    /* Literals are split between internal buffer & output buffer */
         if (nbSeq) {
             seqState_t seqState;
             dctx->fseEntropy = 1;
    @@ -1386,8 +1436,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                     BIT_DStream_completed < BIT_DStream_overflow);
     
             /* decompress without overrunning litPtr begins */
    -        {
    -            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
    +        {   seq_t sequence = {0,0,0};  /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */
                 /* Align the decompression loop to 32 + 16 bytes.
                     *
                     * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
    @@ -1449,27 +1498,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
     #endif
     
                 /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
    -            for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
    -                size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
    +            for ( ; nbSeq; nbSeq--) {
    +                sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
    +                if (litPtr + sequence.litLength > dctx->litBufferEnd) break;
    +                {   size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
    -                assert(!ZSTD_isError(oneSeqSize));
    -                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
    +                    assert(!ZSTD_isError(oneSeqSize));
    +                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
     #endif
    -                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
    -                    return oneSeqSize;
    -                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
    -                op += oneSeqSize;
    -                if (UNLIKELY(!--nbSeq))
    -                    break;
    -                BIT_reloadDStream(&(seqState.DStream));
    -                sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
    -            }
    +                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
    +                        return oneSeqSize;
    +                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
    +                    op += oneSeqSize;
    +            }   }
    +            DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)");
     
                 /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
                 if (nbSeq > 0) {
                     const size_t leftoverLit = dctx->litBufferEnd - litPtr;
    -                if (leftoverLit)
    -                {
    +                DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength);
    +                if (leftoverLit) {
                         RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                         ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                         sequence.litLength -= leftoverLit;
    @@ -1478,24 +1526,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                     litPtr = dctx->litExtraBuffer;
                     litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                     dctx->litBufferLocation = ZSTD_not_in_dst;
    -                {
    -                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
    +                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                         assert(!ZSTD_isError(oneSeqSize));
    -                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
    +                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
     #endif
                         if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                             return oneSeqSize;
                         DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                         op += oneSeqSize;
    -                    if (--nbSeq)
    -                        BIT_reloadDStream(&(seqState.DStream));
                     }
    +                nbSeq--;
                 }
             }
     
    -        if (nbSeq > 0) /* there is remaining lit from extra buffer */
    -        {
    +        if (nbSeq > 0) {
    +            /* there is remaining lit from extra buffer */
     
     #if defined(__GNUC__) && defined(__x86_64__)
                 __asm__(".p2align 6");
    @@ -1514,35 +1560,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
     #  endif
     #endif
     
    -            for (; ; ) {
    -                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
    +            for ( ; nbSeq ; nbSeq--) {
    +                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
                     size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                     assert(!ZSTD_isError(oneSeqSize));
    -                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
    +                ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
     #endif
                     if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                         return oneSeqSize;
                     DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                     op += oneSeqSize;
    -                if (UNLIKELY(!--nbSeq))
    -                    break;
    -                BIT_reloadDStream(&(seqState.DStream));
                 }
             }
     
             /* check if reached exact end */
             DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
             RETURN_ERROR_IF(nbSeq, corruption_detected, "");
    -        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
    +        DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed);
    +        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
             /* save reps for next block */
             { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); }
         }
     
         /* last literal segment */
    -    if (dctx->litBufferLocation == ZSTD_split)  /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
    -    {
    -        size_t const lastLLSize = litBufferEnd - litPtr;
    +    if (dctx->litBufferLocation == ZSTD_split) {
    +        /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
    +        size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
    +        DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize);
             RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
             if (op != NULL) {
                 ZSTD_memmove(op, litPtr, lastLLSize);
    @@ -1552,15 +1597,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
             litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
             dctx->litBufferLocation = ZSTD_not_in_dst;
         }
    -    {   size_t const lastLLSize = litBufferEnd - litPtr;
    +    /* copy last literals from internal buffer */
    +    {   size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
    +        DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize);
             RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
             if (op != NULL) {
                 ZSTD_memcpy(op, litPtr, lastLLSize);
                 op += lastLLSize;
    -        }
    -    }
    +    }   }
     
    -    return op-ostart;
    +    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
    +    return (size_t)(op - ostart);
     }
     
     FORCE_INLINE_TEMPLATE size_t
    @@ -1568,13 +1615,12 @@ DONT_VECTORIZE
     ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
         void* dst, size_t maxDstSize,
         const void* seqStart, size_t seqSize, int nbSeq,
    -    const ZSTD_longOffset_e isLongOffset,
    -    const int frame)
    +    const ZSTD_longOffset_e isLongOffset)
     {
         const BYTE* ip = (const BYTE*)seqStart;
         const BYTE* const iend = ip + seqSize;
         BYTE* const ostart = (BYTE*)dst;
    -    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
    +    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer;
         BYTE* op = ostart;
         const BYTE* litPtr = dctx->litPtr;
         const BYTE* const litEnd = litPtr + dctx->litSize;
    @@ -1582,7 +1628,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
         const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
         const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
         DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
    -    (void)frame;
     
         /* Regen sequences */
         if (nbSeq) {
    @@ -1597,11 +1642,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
             ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
             assert(dst != NULL);
     
    -        ZSTD_STATIC_ASSERT(
    -            BIT_DStream_unfinished < BIT_DStream_completed &&
    -            BIT_DStream_endOfBuffer < BIT_DStream_completed &&
    -            BIT_DStream_completed < BIT_DStream_overflow);
    -
     #if defined(__GNUC__) && defined(__x86_64__)
                 __asm__(".p2align 6");
                 __asm__("nop");
    @@ -1616,73 +1656,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
     #  endif
     #endif
     
    -        for ( ; ; ) {
    -            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
    +        for ( ; nbSeq ; nbSeq--) {
    +            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
                 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                 assert(!ZSTD_isError(oneSeqSize));
    -            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
    +            ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
     #endif
                 if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                     return oneSeqSize;
                 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                 op += oneSeqSize;
    -            if (UNLIKELY(!--nbSeq))
    -                break;
    -            BIT_reloadDStream(&(seqState.DStream));
             }
     
             /* check if reached exact end */
    -        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
    -        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
    -        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
    +        assert(nbSeq == 0);
    +        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
             /* save reps for next block */
             { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); }
         }
     
         /* last literal segment */
    -    {   size_t const lastLLSize = litEnd - litPtr;
    +    {   size_t const lastLLSize = (size_t)(litEnd - litPtr);
    +        DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize);
             RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
             if (op != NULL) {
                 ZSTD_memcpy(op, litPtr, lastLLSize);
                 op += lastLLSize;
    -        }
    -    }
    +    }   }
     
    -    return op-ostart;
    +    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
    +    return (size_t)(op - ostart);
     }
     
     static size_t
     ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
                                      void* dst, size_t maxDstSize,
                                const void* seqStart, size_t seqSize, int nbSeq,
    -                           const ZSTD_longOffset_e isLongOffset,
    -                           const int frame)
    +                           const ZSTD_longOffset_e isLongOffset)
     {
    -    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     
     static size_t
     ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
                                                    void* dst, size_t maxDstSize,
                                              const void* seqStart, size_t seqSize, int nbSeq,
    -                                         const ZSTD_longOffset_e isLongOffset,
    -                                         const int frame)
    +                                         const ZSTD_longOffset_e isLongOffset)
     {
    -    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
     
     #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
     
    -FORCE_INLINE_TEMPLATE size_t
    -ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
    +FORCE_INLINE_TEMPLATE
    +
    +size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
                        const BYTE* const prefixStart, const BYTE* const dictEnd)
     {
         prefetchPos += sequence.litLength;
         {   const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
    -        const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
    -                                                                              * No consequence though : memory address is only used for prefetching, not for dereferencing */
    +        /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
    +         * No consequence though : memory address is only used for prefetching, not for dereferencing */
    +        const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset);
             PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE);   /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
         }
         return prefetchPos + sequence.matchLength;
    @@ -1697,20 +1734,18 @@ ZSTD_decompressSequencesLong_body(
                                    ZSTD_DCtx* dctx,
                                    void* dst, size_t maxDstSize,
                              const void* seqStart, size_t seqSize, int nbSeq,
    -                         const ZSTD_longOffset_e isLongOffset,
    -                         const int frame)
    +                         const ZSTD_longOffset_e isLongOffset)
     {
         const BYTE* ip = (const BYTE*)seqStart;
         const BYTE* const iend = ip + seqSize;
         BYTE* const ostart = (BYTE*)dst;
    -    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
    +    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
         BYTE* op = ostart;
         const BYTE* litPtr = dctx->litPtr;
         const BYTE* litBufferEnd = dctx->litBufferEnd;
         const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
         const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
         const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    -    (void)frame;
     
         /* Regen sequences */
         if (nbSeq) {
    @@ -1735,20 +1770,17 @@ ZSTD_decompressSequencesLong_body(
             ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
     
             /* prepare in advance */
    -        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
    -            {
    +            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) {
                     /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
                     const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                     if (leftoverLit)
    @@ -1761,26 +1793,26 @@ ZSTD_decompressSequencesLong_body(
                     litPtr = dctx->litExtraBuffer;
                     litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                     dctx->litBufferLocation = ZSTD_not_in_dst;
    -                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
    +                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
    -                assert(!ZSTD_isError(oneSeqSize));
    -                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
    +                    assert(!ZSTD_isError(oneSeqSize));
    +                    ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
     #endif
    -                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
    +                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
     
    -                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
    -                sequences[seqNb & STORED_SEQS_MASK] = sequence;
    -                op += oneSeqSize;
    -            }
    +                    prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
    +                    sequences[seqNb & STORED_SEQS_MASK] = sequence;
    +                    op += oneSeqSize;
    +            }   }
                 else
                 {
                     /* lit buffer is either wholly contained in first or second split, or not split at all*/
    -                oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
    +                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
                         ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
                         ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                     assert(!ZSTD_isError(oneSeqSize));
    -                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
    +                ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
     #endif
                     if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
     
    @@ -1789,17 +1821,15 @@ ZSTD_decompressSequencesLong_body(
                     op += oneSeqSize;
                 }
             }
    -        RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
    -            {
    +            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
                     const size_t leftoverLit = dctx->litBufferEnd - litPtr;
    -                if (leftoverLit)
    -                {
    +                if (leftoverLit) {
                         RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                         ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                         sequence->litLength -= leftoverLit;
    @@ -1808,11 +1838,10 @@ ZSTD_decompressSequencesLong_body(
                     litPtr = dctx->litExtraBuffer;
                     litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                     dctx->litBufferLocation = ZSTD_not_in_dst;
    -                {
    -                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
    +                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                         assert(!ZSTD_isError(oneSeqSize));
    -                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
    +                    ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
     #endif
                         if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                         op += oneSeqSize;
    @@ -1825,7 +1854,7 @@ ZSTD_decompressSequencesLong_body(
                         ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
     #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                     assert(!ZSTD_isError(oneSeqSize));
    -                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
    +                ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
     #endif
                     if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                     op += oneSeqSize;
    @@ -1837,8 +1866,7 @@ ZSTD_decompressSequencesLong_body(
         }
     
         /* last literal segment */
    -    if (dctx->litBufferLocation == ZSTD_split)  /* first deplete literal buffer in dst, then copy litExtraBuffer */
    -    {
    +    if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */
             size_t const lastLLSize = litBufferEnd - litPtr;
             RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
             if (op != NULL) {
    @@ -1856,17 +1884,16 @@ ZSTD_decompressSequencesLong_body(
             }
         }
     
    -    return op-ostart;
    +    return (size_t)(op - ostart);
     }
     
     static size_t
     ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
                                      void* dst, size_t maxDstSize,
                                const void* seqStart, size_t seqSize, int nbSeq,
    -                           const ZSTD_longOffset_e isLongOffset,
    -                           const int frame)
    +                           const ZSTD_longOffset_e isLongOffset)
     {
    -    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
     
    @@ -1880,20 +1907,18 @@ DONT_VECTORIZE
     ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
                                      void* dst, size_t maxDstSize,
                                const void* seqStart, size_t seqSize, int nbSeq,
    -                           const ZSTD_longOffset_e isLongOffset,
    -                           const int frame)
    +                           const ZSTD_longOffset_e isLongOffset)
     {
    -    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     static BMI2_TARGET_ATTRIBUTE size_t
     DONT_VECTORIZE
     ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
                                      void* dst, size_t maxDstSize,
                                const void* seqStart, size_t seqSize, int nbSeq,
    -                           const ZSTD_longOffset_e isLongOffset,
    -                           const int frame)
    +                           const ZSTD_longOffset_e isLongOffset)
     {
    -    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
     
    @@ -1902,10 +1927,9 @@ static BMI2_TARGET_ATTRIBUTE size_t
     ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
                                      void* dst, size_t maxDstSize,
                                const void* seqStart, size_t seqSize, int nbSeq,
    -                           const ZSTD_longOffset_e isLongOffset,
    -                           const int frame)
    +                           const ZSTD_longOffset_e isLongOffset)
     {
    -    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
     
    @@ -1915,37 +1939,34 @@ typedef size_t (*ZSTD_decompressSequences_t)(
                                 ZSTD_DCtx* dctx,
                                 void* dst, size_t maxDstSize,
                                 const void* seqStart, size_t seqSize, int nbSeq,
    -                            const ZSTD_longOffset_e isLongOffset,
    -                            const int frame);
    +                            const ZSTD_longOffset_e isLongOffset);
     
     #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
     static size_t
     ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                        const void* seqStart, size_t seqSize, int nbSeq,
    -                   const ZSTD_longOffset_e isLongOffset,
    -                   const int frame)
    +                   const ZSTD_longOffset_e isLongOffset)
     {
         DEBUGLOG(5, "ZSTD_decompressSequences");
     #if DYNAMIC_BMI2
         if (ZSTD_DCtx_get_bmi2(dctx)) {
    -        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
         }
     #endif
    -    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     static size_t
     ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                                      const void* seqStart, size_t seqSize, int nbSeq,
    -                                 const ZSTD_longOffset_e isLongOffset,
    -                                 const int frame)
    +                                 const ZSTD_longOffset_e isLongOffset)
     {
         DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
     #if DYNAMIC_BMI2
         if (ZSTD_DCtx_get_bmi2(dctx)) {
    -        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
         }
     #endif
    -    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
     
    @@ -1960,16 +1981,15 @@ static size_t
     ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                                  const void* seqStart, size_t seqSize, int nbSeq,
    -                             const ZSTD_longOffset_e isLongOffset,
    -                             const int frame)
    +                             const ZSTD_longOffset_e isLongOffset)
     {
         DEBUGLOG(5, "ZSTD_decompressSequencesLong");
     #if DYNAMIC_BMI2
         if (ZSTD_DCtx_get_bmi2(dctx)) {
    -        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
         }
     #endif
    -  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    +  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
     #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
     
    @@ -2051,20 +2071,20 @@ static size_t ZSTD_maxShortOffset(void)
     size_t
     ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                                   void* dst, size_t dstCapacity,
    -                        const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
    +                        const void* src, size_t srcSize, const streaming_operation streaming)
     {   /* blockType == blockCompressed */
         const BYTE* ip = (const BYTE*)src;
    -    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
    +    DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
     
         /* Note : the wording of the specification
    -     * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
    +     * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
          * This generally does not happen, as it makes little sense,
          * since an uncompressed block would feature same size and have no decompression cost.
          * Also, note that decoder from reference libzstd before < v1.5.4
          * would consider this edge case as an error.
    -     * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
    +     * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
          * for broader compatibility with the deployed ecosystem of zstd decoders */
    -    RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
    +    RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
     
         /* Decode literals section */
         {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
    @@ -2079,8 +2099,8 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
             /* Compute the maximum block size, which must also work when !frame and fParams are unset.
              * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
              */
    -        size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
    -        size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
    +        size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
    +        size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
             /* isLongOffset must be true if there are long offsets.
              * Offsets are long if they are larger than ZSTD_maxShortOffset().
              * We don't expect that to be the case in 64-bit mode.
    @@ -2145,21 +2165,22 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
             {
     #endif
     #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
    -            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
    +            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
     #endif
             }
     
     #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
             /* else */
             if (dctx->litBufferLocation == ZSTD_split)
    -            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
    +            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
             else
    -            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
    +            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
     #endif
         }
     }
     
     
    +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
     void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
     {
         if (dst != dctx->previousDstEnd && dstSize > 0) {   /* not contiguous */
    @@ -2176,8 +2197,10 @@ size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
                                      const void* src, size_t srcSize)
     {
         size_t dSize;
    +    dctx->isFrameDecompression = 0;
         ZSTD_checkContinuity(dctx, dst, dstCapacity);
    -    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
    +    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
    +    FORWARD_IF_ERROR(dSize, "");
         dctx->previousDstEnd = (char*)dst + dSize;
         return dSize;
     }
    diff --git a/third-party/zstd/lib/decompress/zstd_decompress_block.h b/third-party/zstd/lib/decompress/zstd_decompress_block.h
    index 9d131888..ab152404 100644
    --- a/third-party/zstd/lib/decompress/zstd_decompress_block.h
    +++ b/third-party/zstd/lib/decompress/zstd_decompress_block.h
    @@ -47,7 +47,7 @@ typedef enum {
      */
     size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                                    void* dst, size_t dstCapacity,
    -                         const void* src, size_t srcSize, const int frame, const streaming_operation streaming);
    +                         const void* src, size_t srcSize, const streaming_operation streaming);
     
     /* ZSTD_buildFSETable() :
      * generate FSE decoding table for one symbol (ll, ml or off)
    diff --git a/third-party/zstd/lib/decompress/zstd_decompress_internal.h b/third-party/zstd/lib/decompress/zstd_decompress_internal.h
    index c2ec5d9f..83a7a011 100644
    --- a/third-party/zstd/lib/decompress/zstd_decompress_internal.h
    +++ b/third-party/zstd/lib/decompress/zstd_decompress_internal.h
    @@ -153,6 +153,7 @@ struct ZSTD_DCtx_s
         size_t litSize;
         size_t rleSize;
         size_t staticSize;
    +    int isFrameDecompression;
     #if DYNAMIC_BMI2 != 0
         int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
     #endif
    @@ -166,6 +167,7 @@ struct ZSTD_DCtx_s
         ZSTD_DDictHashSet* ddictSet;                    /* Hash set for multiple ddicts */
         ZSTD_refMultipleDDicts_e refMultipleDDicts;     /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
         int disableHufAsm;
    +    int maxBlockSizeParam;
     
         /* streaming */
         ZSTD_dStreamStage streamStage;
    diff --git a/third-party/zstd/lib/dictBuilder/cover.c b/third-party/zstd/lib/dictBuilder/cover.c
    index 9e5e7d5b..44f9029a 100644
    --- a/third-party/zstd/lib/dictBuilder/cover.c
    +++ b/third-party/zstd/lib/dictBuilder/cover.c
    @@ -31,8 +31,8 @@
     #endif
     
     #include "../common/mem.h" /* read */
    -#include "../common/pool.h"
    -#include "../common/threading.h"
    +#include "../common/pool.h" /* POOL_ctx */
    +#include "../common/threading.h" /* ZSTD_pthread_mutex_t */
     #include "../common/zstd_internal.h" /* includes zstd.h */
     #include "../common/bits.h" /* ZSTD_highbit32 */
     #include "../zdict.h"
    @@ -78,7 +78,7 @@ static clock_t g_time = 0;
     #undef  LOCALDISPLAYUPDATE
     #define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
       if (displayLevel >= l) {                                                     \
    -    if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {             \
    +    if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {           \
           g_time = clock();                                                        \
           DISPLAY(__VA_ARGS__);                                                    \
         }                                                                          \
    @@ -301,9 +301,10 @@ static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
      * Returns the first pointer in [first, last) whose element does not compare
      * less than value.  If no such element exists it returns last.
      */
    -static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
    +static const size_t *COVER_lower_bound(const size_t* first, const size_t* last,
                                            size_t value) {
    -  size_t count = last - first;
    +  size_t count = (size_t)(last - first);
    +  assert(last >= first);
       while (count != 0) {
         size_t step = count / 2;
         const size_t *ptr = first;
    @@ -549,7 +550,8 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
      */
     static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                               const size_t *samplesSizes, unsigned nbSamples,
    -                          unsigned d, double splitPoint) {
    +                          unsigned d, double splitPoint)
    +{
       const BYTE *const samples = (const BYTE *)samplesBuffer;
       const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
       /* Split samples into testing and training sets */
    @@ -733,7 +735,7 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
       return tail;
     }
     
    -ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
    +ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
         void *dictBuffer, size_t dictBufferCapacity,
         const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
         ZDICT_cover_params_t parameters)
    @@ -907,8 +909,10 @@ void COVER_best_start(COVER_best_t *best) {
      * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
      * If this dictionary is the best so far save it and its parameters.
      */
    -void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
    -                              COVER_dictSelection_t selection) {
    +void COVER_best_finish(COVER_best_t* best,
    +                      ZDICT_cover_params_t parameters,
    +                      COVER_dictSelection_t selection)
    +{
       void* dict = selection.dictContent;
       size_t compressedSize = selection.totalCompressedSize;
       size_t dictSize = selection.dictSize;
    @@ -980,8 +984,8 @@ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBuffe
       size_t largestCompressed = 0;
       BYTE* customDictContentEnd = customDictContent + dictContentSize;
     
    -  BYTE * largestDictbuffer = (BYTE *)malloc(dictBufferCapacity);
    -  BYTE * candidateDictBuffer = (BYTE *)malloc(dictBufferCapacity);
    +  BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);
    +  BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);
       double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
     
       if (!largestDictbuffer || !candidateDictBuffer) {
    @@ -1119,7 +1123,7 @@ static void COVER_tryParameters(void *opaque)
       free(freqs);
     }
     
    -ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    +ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
         void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,
         const size_t* samplesSizes, unsigned nbSamples,
         ZDICT_cover_params_t* parameters)
    diff --git a/third-party/zstd/lib/dictBuilder/cover.h b/third-party/zstd/lib/dictBuilder/cover.h
    index 252624bd..a5d7506e 100644
    --- a/third-party/zstd/lib/dictBuilder/cover.h
    +++ b/third-party/zstd/lib/dictBuilder/cover.h
    @@ -12,14 +12,8 @@
     #  define ZDICT_STATIC_LINKING_ONLY
     #endif
     
    -#include   /* fprintf */
    -#include  /* malloc, free, qsort */
    -#include  /* memset */
    -#include    /* clock */
    -#include "../common/mem.h" /* read */
    -#include "../common/pool.h"
    -#include "../common/threading.h"
    -#include "../common/zstd_internal.h" /* includes zstd.h */
    +#include "../common/threading.h" /* ZSTD_pthread_mutex_t */
    +#include "../common/mem.h"   /* U32, BYTE */
     #include "../zdict.h"
     
     /**
    diff --git a/third-party/zstd/lib/dictBuilder/fastcover.c b/third-party/zstd/lib/dictBuilder/fastcover.c
    index 46bba012..a958eb33 100644
    --- a/third-party/zstd/lib/dictBuilder/fastcover.c
    +++ b/third-party/zstd/lib/dictBuilder/fastcover.c
    @@ -545,7 +545,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
     }
     
     
    -ZDICTLIB_API size_t
    +ZDICTLIB_STATIC_API size_t
     ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
                                     const void* samplesBuffer,
                                     const size_t* samplesSizes, unsigned nbSamples,
    @@ -614,7 +614,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
     }
     
     
    -ZDICTLIB_API size_t
    +ZDICTLIB_STATIC_API size_t
     ZDICT_optimizeTrainFromBuffer_fastCover(
                         void* dictBuffer, size_t dictBufferCapacity,
                         const void* samplesBuffer,
    diff --git a/third-party/zstd/lib/dictBuilder/zdict.c b/third-party/zstd/lib/dictBuilder/zdict.c
    index 58290f45..82e999e8 100644
    --- a/third-party/zstd/lib/dictBuilder/zdict.c
    +++ b/third-party/zstd/lib/dictBuilder/zdict.c
    @@ -74,9 +74,9 @@ static const U32 g_selectivity_default = 9;
     *  Console display
     ***************************************/
     #undef  DISPLAY
    -#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
    +#define DISPLAY(...)         do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0)
     #undef  DISPLAYLEVEL
    -#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); }    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
    +#define DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0)    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
     
     static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
     
    @@ -477,10 +477,16 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
         clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
     
     #   undef  DISPLAYUPDATE
    -#   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
    -            if (ZDICT_clockSpan(displayClock) > refreshRate)  \
    -            { displayClock = clock(); DISPLAY(__VA_ARGS__); \
    -            if (notificationLevel>=4) fflush(stderr); } }
    +#   define DISPLAYUPDATE(l, ...)                                   \
    +        do {                                                       \
    +            if (notificationLevel>=l) {                            \
    +                if (ZDICT_clockSpan(displayClock) > refreshRate) { \
    +                    displayClock = clock();                        \
    +                    DISPLAY(__VA_ARGS__);                          \
    +                }                                                  \
    +                if (notificationLevel>=4) fflush(stderr);          \
    +            }                                                      \
    +        } while (0)
     
         /* init */
         DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
    diff --git a/third-party/zstd/lib/legacy/zstd_legacy.h b/third-party/zstd/lib/legacy/zstd_legacy.h
    index dd173251..7a8a04e5 100644
    --- a/third-party/zstd/lib/legacy/zstd_legacy.h
    +++ b/third-party/zstd/lib/legacy/zstd_legacy.h
    @@ -124,6 +124,20 @@ MEM_STATIC size_t ZSTD_decompressLegacy(
                    const void* dict,size_t dictSize)
     {
         U32 const version = ZSTD_isLegacy(src, compressedSize);
    +    char x;
    +    /* Avoid passing NULL to legacy decoding. */
    +    if (dst == NULL) {
    +        assert(dstCapacity == 0);
    +        dst = &x;
    +    }
    +    if (src == NULL) {
    +        assert(compressedSize == 0);
    +        src = &x;
    +    }
    +    if (dict == NULL) {
    +        assert(dictSize == 0);
    +        dict = &x;
    +    }
         (void)dst; (void)dstCapacity; (void)dict; (void)dictSize;  /* unused when ZSTD_LEGACY_SUPPORT >= 8 */
         switch(version)
         {
    @@ -287,6 +301,12 @@ MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version)
     MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion,
                                             const void* dict, size_t dictSize)
     {
    +    char x;
    +    /* Avoid passing NULL to legacy decoding. */
    +    if (dict == NULL) {
    +        assert(dictSize == 0);
    +        dict = &x;
    +    }
         DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion);
         if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion);
         switch(newVersion)
    @@ -346,6 +366,16 @@ MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U
     MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version,
                                                   ZSTD_outBuffer* output, ZSTD_inBuffer* input)
     {
    +    static char x;
    +    /* Avoid passing NULL to legacy decoding. */
    +    if (output->dst == NULL) {
    +        assert(output->size == 0);
    +        output->dst = &x;
    +    }
    +    if (input->src == NULL) {
    +        assert(input->size == 0);
    +        input->src = &x;
    +    }
         DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version);
         switch(version)
         {
    diff --git a/third-party/zstd/lib/legacy/zstd_v01.c b/third-party/zstd/lib/legacy/zstd_v01.c
    index 1a3aad07..6cf51234 100644
    --- a/third-party/zstd/lib/legacy/zstd_v01.c
    +++ b/third-party/zstd/lib/legacy/zstd_v01.c
    @@ -14,6 +14,7 @@
     ******************************************/
     #include     /* size_t, ptrdiff_t */
     #include "zstd_v01.h"
    +#include "../common/compiler.h"
     #include "../common/error_private.h"
     
     
    @@ -2118,6 +2119,7 @@ size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSi
             }
             ctx->phase = 1;
             ctx->expected = ZSTD_blockHeaderSize;
    +        if (ZSTDv01_isError(rSize)) return rSize;
             ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
             return rSize;
         }
    diff --git a/third-party/zstd/lib/legacy/zstd_v02.c b/third-party/zstd/lib/legacy/zstd_v02.c
    index e09bb4a2..6d39b6e5 100644
    --- a/third-party/zstd/lib/legacy/zstd_v02.c
    +++ b/third-party/zstd/lib/legacy/zstd_v02.c
    @@ -11,6 +11,7 @@
     
     #include     /* size_t, ptrdiff_t */
     #include "zstd_v02.h"
    +#include "../common/compiler.h"
     #include "../common/error_private.h"
     
     
    @@ -71,20 +72,6 @@ extern "C" {
     #include     /* memcpy */
     
     
    -/******************************************
    -*  Compiler-specific
    -******************************************/
    -#if defined(__GNUC__)
    -#  define MEM_STATIC static __attribute__((unused))
    -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
    -#  define MEM_STATIC static inline
    -#elif defined(_MSC_VER)
    -#  define MEM_STATIC static __inline
    -#else
    -#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
    -#endif
    -
    -
     /****************************************************************
     *  Basic Types
     *****************************************************************/
    @@ -875,7 +862,7 @@ extern "C" {
     *  Streaming functions
     ***************************************/
     
    -typedef struct ZSTD_DCtx_s ZSTD_DCtx;
    +typedef struct ZSTDv02_Dctx_s ZSTD_DCtx;
     
     /*
       Use above functions alternatively.
    @@ -2750,7 +2737,7 @@ static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
     /* *************************************************************
     *   Decompression section
     ***************************************************************/
    -struct ZSTD_DCtx_s
    +struct ZSTDv02_Dctx_s
     {
         U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
         U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
    @@ -3431,6 +3418,7 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi
             }
             ctx->phase = 1;
             ctx->expected = ZSTD_blockHeaderSize;
    +        if (ZSTD_isError(rSize)) return rSize;
             ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
             return rSize;
         }
    diff --git a/third-party/zstd/lib/legacy/zstd_v03.c b/third-party/zstd/lib/legacy/zstd_v03.c
    index b0d7f521..47195f33 100644
    --- a/third-party/zstd/lib/legacy/zstd_v03.c
    +++ b/third-party/zstd/lib/legacy/zstd_v03.c
    @@ -11,6 +11,7 @@
     
     #include     /* size_t, ptrdiff_t */
     #include "zstd_v03.h"
    +#include "../common/compiler.h"
     #include "../common/error_private.h"
     
     
    @@ -72,20 +73,6 @@ extern "C" {
     #include     /* memcpy */
     
     
    -/******************************************
    -*  Compiler-specific
    -******************************************/
    -#if defined(__GNUC__)
    -#  define MEM_STATIC static __attribute__((unused))
    -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
    -#  define MEM_STATIC static inline
    -#elif defined(_MSC_VER)
    -#  define MEM_STATIC static __inline
    -#else
    -#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
    -#endif
    -
    -
     /****************************************************************
     *  Basic Types
     *****************************************************************/
    @@ -875,7 +862,7 @@ extern "C" {
     *  Streaming functions
     ***************************************/
     
    -typedef struct ZSTD_DCtx_s ZSTD_DCtx;
    +typedef struct ZSTDv03_Dctx_s ZSTD_DCtx;
     
     /*
       Use above functions alternatively.
    @@ -2390,7 +2377,7 @@ static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
     /* *************************************************************
     *   Decompression section
     ***************************************************************/
    -struct ZSTD_DCtx_s
    +struct ZSTDv03_Dctx_s
     {
         U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
         U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
    @@ -3071,6 +3058,7 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi
             }
             ctx->phase = 1;
             ctx->expected = ZSTD_blockHeaderSize;
    +        if (ZSTD_isError(rSize)) return rSize;
             ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
             return rSize;
         }
    diff --git a/third-party/zstd/lib/legacy/zstd_v04.c b/third-party/zstd/lib/legacy/zstd_v04.c
    index 57be832b..0da316c1 100644
    --- a/third-party/zstd/lib/legacy/zstd_v04.c
    +++ b/third-party/zstd/lib/legacy/zstd_v04.c
    @@ -16,6 +16,7 @@
     #include     /* memcpy */
     
     #include "zstd_v04.h"
    +#include "../common/compiler.h"
     #include "../common/error_private.h"
     
     
    @@ -37,15 +38,6 @@ extern "C" {
     #   include   /* _byteswap_ulong */
     #   include   /* _byteswap_* */
     #endif
    -#if defined(__GNUC__)
    -#  define MEM_STATIC static __attribute__((unused))
    -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
    -#  define MEM_STATIC static inline
    -#elif defined(_MSC_VER)
    -#  define MEM_STATIC static __inline
    -#else
    -#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
    -#endif
     
     
     /****************************************************************
    @@ -3218,6 +3210,7 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi
                 }
                 ctx->stage = ZSTDds_decodeBlockHeader;
                 ctx->expected = ZSTD_blockHeaderSize;
    +            if (ZSTD_isError(rSize)) return rSize;
                 ctx->previousDstEnd = (char*)dst + rSize;
                 return rSize;
             }
    @@ -3545,8 +3538,8 @@ static size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDs
     unsigned ZBUFFv04_isError(size_t errorCode) { return ERR_isError(errorCode); }
     const char* ZBUFFv04_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
     
    -size_t ZBUFFv04_recommendedDInSize()  { return BLOCKSIZE + 3; }
    -size_t ZBUFFv04_recommendedDOutSize() { return BLOCKSIZE; }
    +size_t ZBUFFv04_recommendedDInSize(void)  { return BLOCKSIZE + 3; }
    +size_t ZBUFFv04_recommendedDOutSize(void) { return BLOCKSIZE; }
     
     
     
    diff --git a/third-party/zstd/lib/legacy/zstd_v05.c b/third-party/zstd/lib/legacy/zstd_v05.c
    index 93a1169f..44a877bf 100644
    --- a/third-party/zstd/lib/legacy/zstd_v05.c
    +++ b/third-party/zstd/lib/legacy/zstd_v05.c
    @@ -3600,6 +3600,7 @@ size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t maxDstSi
                 }
                 dctx->stage = ZSTDv05ds_decodeBlockHeader;
                 dctx->expected = ZSTDv05_blockHeaderSize;
    +            if (ZSTDv05_isError(rSize)) return rSize;
                 dctx->previousDstEnd = (char*)dst + rSize;
                 return rSize;
             }
    diff --git a/third-party/zstd/lib/legacy/zstd_v06.c b/third-party/zstd/lib/legacy/zstd_v06.c
    index 175f7cc4..00d6ef79 100644
    --- a/third-party/zstd/lib/legacy/zstd_v06.c
    +++ b/third-party/zstd/lib/legacy/zstd_v06.c
    @@ -14,6 +14,7 @@
     #include     /* size_t, ptrdiff_t */
     #include     /* memcpy */
     #include     /* malloc, free, qsort */
    +#include "../common/compiler.h"
     #include "../common/error_private.h"
     
     
    @@ -67,15 +68,6 @@ extern "C" {
     #   include   /* _byteswap_ulong */
     #   include   /* _byteswap_* */
     #endif
    -#if defined(__GNUC__)
    -#  define MEM_STATIC static __attribute__((unused))
    -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
    -#  define MEM_STATIC static inline
    -#elif defined(_MSC_VER)
    -#  define MEM_STATIC static __inline
    -#else
    -#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
    -#endif
     
     
     /*-**************************************************************
    @@ -3745,6 +3737,7 @@ size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapac
                 }
                 dctx->stage = ZSTDds_decodeBlockHeader;
                 dctx->expected = ZSTDv06_blockHeaderSize;
    +            if (ZSTDv06_isError(rSize)) return rSize;
                 dctx->previousDstEnd = (char*)dst + rSize;
                 return rSize;
             }
    diff --git a/third-party/zstd/lib/legacy/zstd_v07.c b/third-party/zstd/lib/legacy/zstd_v07.c
    index 15dc3ef7..8778f079 100644
    --- a/third-party/zstd/lib/legacy/zstd_v07.c
    +++ b/third-party/zstd/lib/legacy/zstd_v07.c
    @@ -24,6 +24,7 @@
     #define HUFv07_STATIC_LINKING_ONLY   /* HUFv07_TABLELOG_ABSOLUTEMAX */
     #define ZSTDv07_STATIC_LINKING_ONLY
     
    +#include "../common/compiler.h"
     #include "../common/error_private.h"
     
     
    @@ -227,15 +228,6 @@ extern "C" {
     #   include   /* _byteswap_ulong */
     #   include   /* _byteswap_* */
     #endif
    -#if defined(__GNUC__)
    -#  define MEM_STATIC static __attribute__((unused))
    -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
    -#  define MEM_STATIC static inline
    -#elif defined(_MSC_VER)
    -#  define MEM_STATIC static __inline
    -#else
    -#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
    -#endif
     
     
     /*-**************************************************************
    @@ -4015,8 +4007,8 @@ size_t ZSTDv07_decompressContinue(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapac
                 }
                 dctx->stage = ZSTDds_decodeBlockHeader;
                 dctx->expected = ZSTDv07_blockHeaderSize;
    -            dctx->previousDstEnd = (char*)dst + rSize;
                 if (ZSTDv07_isError(rSize)) return rSize;
    +            dctx->previousDstEnd = (char*)dst + rSize;
                 if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);
                 return rSize;
             }
    diff --git a/third-party/zstd/lib/libzstd.mk b/third-party/zstd/lib/libzstd.mk
    index 5e11d5d2..a308a6ef 100644
    --- a/third-party/zstd/lib/libzstd.mk
    +++ b/third-party/zstd/lib/libzstd.mk
    @@ -8,12 +8,21 @@
     # You may select, at your option, one of the above-listed licenses.
     # ################################################################
     
    +# This included Makefile provides the following variables :
    +# LIB_SRCDIR, LIB_BINDIR
    +
    +# Ensure the file is not included twice
    +# Note : must be included after setting the default target
    +ifndef LIBZSTD_MK_INCLUDED
    +LIBZSTD_MK_INCLUDED := 1
    +
     ##################################################################
     # Input Variables
     ##################################################################
     
    -# Zstd lib directory
    -LIBZSTD ?= ./
    +# By default, library's directory is same as this included makefile
    +LIB_SRCDIR ?= $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
     +LIB_BINDIR ?= $(LIB_SRCDIR)
     
     # ZSTD_LIB_MINIFY is a helper variable that
     # configures a bunch of other variables to space-optimized defaults.
    @@ -47,6 +56,9 @@ endif
     # Assembly support
     ZSTD_NO_ASM ?= 0
     
    +ZSTD_LIB_EXCLUDE_COMPRESSORS_DFAST_AND_UP ?= 0
    +ZSTD_LIB_EXCLUDE_COMPRESSORS_GREEDY_AND_UP ?= 0
    +
     ##################################################################
     # libzstd helpers
     ##################################################################
    @@ -57,6 +69,7 @@ VOID ?= /dev/null
     NUM_SYMBOL := \#
     
     # define silent mode as default (verbose mode with V=1 or VERBOSE=1)
    +# Note : must be defined _after_ the default target
     $(V)$(VERBOSE).SILENT:
     
     # When cross-compiling from linux to windows,
    @@ -66,7 +79,7 @@ $(V)$(VERBOSE).SILENT:
     TARGET_SYSTEM ?= $(OS)
     
     # Version numbers
    -LIBVER_SRC := $(LIBZSTD)/zstd.h
    +LIBVER_SRC := $(LIB_SRCDIR)/zstd.h
     LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
     LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
     LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
    @@ -133,14 +146,14 @@ ifeq ($(HAVE_COLORNEVER), 1)
     endif
     GREP = grep $(GREP_OPTIONS)
     
    -ZSTD_COMMON_FILES := $(sort $(wildcard $(LIBZSTD)/common/*.c))
    -ZSTD_COMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/compress/*.c))
    -ZSTD_DECOMPRESS_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*.c))
    -ZSTD_DICTBUILDER_FILES := $(sort $(wildcard $(LIBZSTD)/dictBuilder/*.c))
    -ZSTD_DEPRECATED_FILES := $(sort $(wildcard $(LIBZSTD)/deprecated/*.c))
    +ZSTD_COMMON_FILES := $(sort $(wildcard $(LIB_SRCDIR)/common/*.c))
    +ZSTD_COMPRESS_FILES := $(sort $(wildcard $(LIB_SRCDIR)/compress/*.c))
    +ZSTD_DECOMPRESS_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*.c))
    +ZSTD_DICTBUILDER_FILES := $(sort $(wildcard $(LIB_SRCDIR)/dictBuilder/*.c))
    +ZSTD_DEPRECATED_FILES := $(sort $(wildcard $(LIB_SRCDIR)/deprecated/*.c))
     ZSTD_LEGACY_FILES :=
     
    -ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*_amd64.S))
    +ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_amd64.S))
     
     ifneq ($(ZSTD_NO_ASM), 0)
       CPPFLAGS += -DZSTD_DISABLE_ASM
    @@ -178,9 +191,17 @@ ifneq ($(ZSTD_LEGACY_MULTITHREADED_API), 0)
       CFLAGS += -DZSTD_LEGACY_MULTITHREADED_API
     endif
     
    +ifneq ($(ZSTD_LIB_EXCLUDE_COMPRESSORS_DFAST_AND_UP), 0)
    +  CFLAGS += -DZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
    +else
    +ifneq ($(ZSTD_LIB_EXCLUDE_COMPRESSORS_GREEDY_AND_UP), 0)
    +  CFLAGS += -DZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR -DZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
    +endif
    +endif
    +
     ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
     ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
    -  ZSTD_LEGACY_FILES += $(shell ls $(LIBZSTD)/legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
    +  ZSTD_LEGACY_FILES += $(shell ls $(LIB_SRCDIR)/legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
     endif
     endif
     CPPFLAGS  += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
    @@ -209,6 +230,8 @@ ifeq ($(HAVE_HASH),0)
     endif
     endif # BUILD_DIR
     
    -ZSTD_SUBDIR := $(LIBZSTD)/common $(LIBZSTD)/compress $(LIBZSTD)/decompress $(LIBZSTD)/dictBuilder $(LIBZSTD)/legacy $(LIBZSTD)/deprecated
    +ZSTD_SUBDIR := $(LIB_SRCDIR)/common $(LIB_SRCDIR)/compress $(LIB_SRCDIR)/decompress $(LIB_SRCDIR)/dictBuilder $(LIB_SRCDIR)/legacy $(LIB_SRCDIR)/deprecated
     vpath %.c $(ZSTD_SUBDIR)
     vpath %.S $(ZSTD_SUBDIR)
    +
    +endif # LIBZSTD_MK_INCLUDED
    diff --git a/third-party/zstd/lib/zstd.h b/third-party/zstd/lib/zstd.h
    index e5c3f8b6..5d1fef8a 100644
    --- a/third-party/zstd/lib/zstd.h
    +++ b/third-party/zstd/lib/zstd.h
    @@ -106,7 +106,7 @@ extern "C" {
     /*------   Version   ------*/
     #define ZSTD_VERSION_MAJOR    1
     #define ZSTD_VERSION_MINOR    5
    -#define ZSTD_VERSION_RELEASE  5
    +#define ZSTD_VERSION_RELEASE  6
     #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
     
     /*! ZSTD_versionNumber() :
    @@ -228,7 +228,7 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
      * for example to size a static array on stack.
      * Will produce constant value 0 if srcSize too large.
      */
    -#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
    +#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U)
     #define ZSTD_COMPRESSBOUND(srcSize)   (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
     ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
     /* ZSTD_isError() :
    @@ -249,7 +249,7 @@ ZSTDLIB_API int         ZSTD_defaultCLevel(void);           /*!< default compres
     /*= Compression context
      *  When compressing many times,
      *  it is recommended to allocate a context just once,
    - *  and re-use it for each successive compression operation.
    + *  and reuse it for each successive compression operation.
      *  This will make workload friendlier for system's memory.
      *  Note : re-using context is just a speed / resource optimization.
      *         It doesn't change the compression ratio, which remains identical.
    @@ -262,9 +262,9 @@ ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);  /* accept NULL pointer *
     
     /*! ZSTD_compressCCtx() :
      *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
    - *  Important : in order to behave similarly to `ZSTD_compress()`,
    - *  this function compresses at requested compression level,
    - *  __ignoring any other parameter__ .
    + *  Important : in order to mirror `ZSTD_compress()` behavior,
    + *  this function compresses at the requested compression level,
    + *  __ignoring any other advanced parameter__ .
      *  If any advanced parameter was set using the advanced API,
      *  they will all be reset. Only `compressionLevel` remains.
      */
    @@ -276,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
     /*= Decompression context
      *  When decompressing many times,
      *  it is recommended to allocate a context only once,
    - *  and re-use it for each successive compression operation.
    + *  and reuse it for each successive compression operation.
      *  This will make workload friendlier for system's memory.
      *  Use one context per thread for parallel execution. */
     typedef struct ZSTD_DCtx_s ZSTD_DCtx;
    @@ -286,7 +286,7 @@ ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);  /* accept NULL pointer *
     /*! ZSTD_decompressDCtx() :
      *  Same as ZSTD_decompress(),
      *  requires an allocated ZSTD_DCtx.
    - *  Compatible with sticky parameters.
    + *  Compatible with sticky parameters (see below).
      */
     ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
                                            void* dst, size_t dstCapacity,
    @@ -302,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
      *   using ZSTD_CCtx_set*() functions.
      *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
      *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
    - *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
    + *   __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ .
      *
      *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
      *
      *   This API supersedes all other "advanced" API entry points in the experimental section.
    - *   In the future, we expect to remove from experimental API entry points which are redundant with this API.
    + *   In the future, we expect to remove API entry points from experimental which are redundant with this API.
      */
     
     
    @@ -390,6 +390,19 @@ typedef enum {
                                   * The higher the value of selected strategy, the more complex it is,
                                   * resulting in stronger and slower compression.
                                   * Special: value 0 means "use default strategy". */
    +
    +    ZSTD_c_targetCBlockSize=130, /* v1.5.6+
     +                                  * Attempts to fit compressed block size into approximately targetCBlockSize.
    +                                  * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX.
    +                                  * Note that it's not a guarantee, just a convergence target (default:0).
    +                                  * No target when targetCBlockSize == 0.
    +                                  * This is helpful in low bandwidth streaming environments to improve end-to-end latency,
    +                                  * when a client can make use of partial documents (a prominent example being Chrome).
    +                                  * Note: this parameter is stable since v1.5.6.
    +                                  * It was present as an experimental parameter in earlier versions,
    +                                  * but it's not recommended using it with earlier library versions
    +                                  * due to massive performance regressions.
    +                                  */
         /* LDM mode parameters */
         ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
                                          * This parameter is designed to improve compression ratio
    @@ -469,7 +482,6 @@ typedef enum {
          * ZSTD_c_forceMaxWindow
          * ZSTD_c_forceAttachDict
          * ZSTD_c_literalCompressionMode
    -     * ZSTD_c_targetCBlockSize
          * ZSTD_c_srcSizeHint
          * ZSTD_c_enableDedicatedDictSearch
          * ZSTD_c_stableInBuffer
    @@ -490,7 +502,7 @@ typedef enum {
          ZSTD_c_experimentalParam3=1000,
          ZSTD_c_experimentalParam4=1001,
          ZSTD_c_experimentalParam5=1002,
    -     ZSTD_c_experimentalParam6=1003,
    +     /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */
          ZSTD_c_experimentalParam7=1004,
          ZSTD_c_experimentalParam8=1005,
          ZSTD_c_experimentalParam9=1006,
    @@ -575,6 +587,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
     
     /*! ZSTD_compress2() :
      *  Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
    + *  (note that this entry point doesn't even expose a compression level parameter).
      *  ZSTD_compress2() always starts a new frame.
      *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
      *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
    @@ -618,6 +631,7 @@ typedef enum {
          * ZSTD_d_forceIgnoreChecksum
          * ZSTD_d_refMultipleDDicts
          * ZSTD_d_disableHuffmanAssembly
    +     * ZSTD_d_maxBlockSize
          * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
          * note : never ever use experimentalParam? names directly
          */
    @@ -625,7 +639,8 @@ typedef enum {
          ZSTD_d_experimentalParam2=1001,
          ZSTD_d_experimentalParam3=1002,
          ZSTD_d_experimentalParam4=1003,
    -     ZSTD_d_experimentalParam5=1004
    +     ZSTD_d_experimentalParam5=1004,
    +     ZSTD_d_experimentalParam6=1005
     
     } ZSTD_dParameter;
     
    @@ -680,14 +695,14 @@ typedef struct ZSTD_outBuffer_s {
     *  A ZSTD_CStream object is required to track streaming operation.
     *  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
     *  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
    -*  It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
    +*  It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
     *
     *  For parallel execution, use one separate ZSTD_CStream per thread.
     *
     *  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
     *
     *  Parameters are sticky : when starting a new compression on the same context,
    -*  it will re-use the same sticky parameters as previous compression session.
    +*  it will reuse the same sticky parameters as previous compression session.
     *  When in doubt, it's recommended to fully initialize the context before usage.
     *  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
     *  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
    @@ -776,6 +791,11 @@ typedef enum {
      *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
      *            Before starting a new compression job, or changing compression parameters,
      *            it is required to fully flush internal buffers.
    + *  - note: if an operation ends with an error, it may leave @cctx in an undefined state.
     + *          Therefore, it's UB to invoke ZSTD_compressStream2() or ZSTD_compressStream() on such a state.
    + *          In order to be re-employed after an error, a state must be reset,
    + *          which can be done explicitly (ZSTD_CCtx_reset()),
    + *          or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx())
      */
     ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
                                              ZSTD_outBuffer* output,
    @@ -835,7 +855,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
     *
     *  A ZSTD_DStream object is required to track streaming operations.
     *  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
    -*  ZSTD_DStream objects can be re-used multiple times.
    +*  ZSTD_DStream objects can be reused multiple times.
     *
     *  Use ZSTD_initDStream() to start a new decompression operation.
     * @return : recommended first input size
    @@ -889,6 +909,12 @@ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
      * @return : 0 when a frame is completely decoded and fully flushed,
      *           or an error code, which can be tested using ZSTD_isError(),
      *           or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
    + *
    + * Note: when an operation returns with an error code, the @zds state may be left in undefined state.
    + *       It's UB to invoke `ZSTD_decompressStream()` on such a state.
    + *       In order to re-use such a state, it must be first reset,
    + *       which can be done explicitly (`ZSTD_DCtx_reset()`),
    + *       or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`)
      */
     ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
     
    @@ -1021,7 +1047,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
      *
      * This API allows dictionaries to be used with ZSTD_compress2(),
      * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
    - * Dictionaries are sticky, they remain valid when same context is re-used,
    + * Dictionaries are sticky, they remain valid when same context is reused,
      * they only reset when the context is reset
      * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
      * In contrast, Prefixes are single-use.
    @@ -1239,7 +1265,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
     #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
     
     /* Advanced parameter bounds */
    -#define ZSTD_TARGETCBLOCKSIZE_MIN   64
    +#define ZSTD_TARGETCBLOCKSIZE_MIN   1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */
     #define ZSTD_TARGETCBLOCKSIZE_MAX   ZSTD_BLOCKSIZE_MAX
     #define ZSTD_SRCSIZEHINT_MIN        0
     #define ZSTD_SRCSIZEHINT_MAX        INT_MAX
    @@ -1527,25 +1553,38 @@ typedef enum {
     ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
     
     /*! ZSTD_generateSequences() :
    + * WARNING: This function is meant for debugging and informational purposes ONLY!
    + * Its implementation is flawed, and it will be deleted in a future version.
    + * It is not guaranteed to succeed, as there are several cases where it will give
    + * up and fail. You should NOT use this function in production code.
    + *
    + * This function is deprecated, and will be removed in a future version.
    + *
      * Generate sequences using ZSTD_compress2(), given a source buffer.
      *
    + * @param zc The compression context to be used for ZSTD_compress2(). Set any
    + *           compression parameters you need on this context.
    + * @param outSeqs The output sequences buffer of size @p outSeqsSize
    + * @param outSeqsSize The size of the output sequences buffer.
    + *                    ZSTD_sequenceBound(srcSize) is an upper bound on the number
    + *                    of sequences that can be generated.
    + * @param src The source buffer to generate sequences from of size @p srcSize.
    + * @param srcSize The size of the source buffer.
    + *
      * Each block will end with a dummy sequence
      * with offset == 0, matchLength == 0, and litLength == length of last literals.
      * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
      * simply acts as a block delimiter.
      *
    - * @zc can be used to insert custom compression params.
    - * This function invokes ZSTD_compress2().
    - *
    - * The output of this function can be fed into ZSTD_compressSequences() with CCtx
    - * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
    - * @return : number of sequences generated
    + * @returns The number of sequences generated, necessarily less than
    + *          ZSTD_sequenceBound(srcSize), or an error code that can be checked
    + *          with ZSTD_isError().
      */
    -
    +ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
     ZSTDLIB_STATIC_API size_t
    -ZSTD_generateSequences( ZSTD_CCtx* zc,
    -                        ZSTD_Sequence* outSeqs, size_t outSeqsSize,
    -                        const void* src, size_t srcSize);
    +ZSTD_generateSequences(ZSTD_CCtx* zc,
    +                       ZSTD_Sequence* outSeqs, size_t outSeqsSize,
    +                       const void* src, size_t srcSize);
     
     /*! ZSTD_mergeBlockDelimiters() :
      * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
    @@ -1640,56 +1679,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
     /*! ZSTD_estimate*() :
      *  These functions make it possible to estimate memory usage
      *  of a future {D,C}Ctx, before its creation.
    + *  This is useful in combination with ZSTD_initStatic(),
    + *  which makes it possible to employ a static buffer for ZSTD_CCtx* state.
      *
      *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
    - *  for any compression level up to selected one.
    - *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
    - *         does not include space for a window buffer.
    - *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
    + *  to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2()
    + *  associated with any compression level up to max specified one.
      *  The estimate will assume the input may be arbitrarily large,
      *  which is the worst case.
      *
    + *  Note that the size estimation is specific for one-shot compression,
    + *  it is not valid for streaming (see ZSTD_estimateCStreamSize*())
    + *  nor other potential ways of using a ZSTD_CCtx* state.
    + *
      *  When srcSize can be bound by a known and rather "small" value,
    - *  this fact can be used to provide a tighter estimation
    - *  because the CCtx compression context will need less memory.
    - *  This tighter estimation can be provided by more advanced functions
    + *  this knowledge can be used to provide a tighter budget estimation
    + *  because the ZSTD_CCtx* state will need less memory for small inputs.
    + *  This tighter estimation can be provided by employing more advanced functions
      *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
      *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
      *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
      *
      *  Note : only single-threaded compression is supported.
      *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
    - *
    - *  Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
    - *  Size estimates assume that no external sequence producer is registered.
      */
    -ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
    +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
     
     /*! ZSTD_estimateCStreamSize() :
    - *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
    - *  It will also consider src size to be arbitrarily "large", which is worst case.
    + *  ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression
    + *  using any compression level up to the max specified one.
    + *  It will also consider src size to be arbitrarily "large", which is a worst case scenario.
      *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
      *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
      *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
      *  Note : CStream size estimation is only correct for single-threaded compression.
    - *  ZSTD_DStream memory budget depends on window Size.
    + *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
    + *  Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
    + *  Size estimates assume that no external sequence producer is registered.
    + *
    + *  ZSTD_DStream memory budget depends on frame's window Size.
      *  This information can be passed manually, using ZSTD_estimateDStreamSize,
      *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
    + *  Any frame requesting a window size larger than max specified one will be rejected.
      *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
      *         an internal ?Dict will be created, which additional size is not estimated here.
      *         In this case, get total size by adding ZSTD_estimate?DictSize
    - *  Note 2 : only single-threaded compression is supported.
    - *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
    - *  Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
    - *  Size estimates assume that no external sequence producer is registered.
      */
    -ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
    +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
    -ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
    +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize);
     ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
     
     /*! ZSTD_estimate?DictSize() :
    @@ -1946,11 +1988,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
      */
     #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
     
    -/* Tries to fit compressed block size to be around targetCBlockSize.
    - * No target when targetCBlockSize == 0.
    - * There is no guarantee on compressed block size (default:0) */
    -#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
    -
     /* User's best guess of source size.
      * Hint is not valid when srcSizeHint == 0.
      * There is no guarantee that hint is close to actual source size,
    @@ -2430,6 +2467,22 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
      */
     #define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
     
    +/* ZSTD_d_maxBlockSize
    + * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
    + * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
    + *
    + * Forces the decompressor to reject blocks whose content size is
    + * larger than the configured maxBlockSize. When maxBlockSize is
    + * larger than the windowSize, the windowSize is used instead.
    + * This saves memory on the decoder when you know all blocks are small.
    + *
    + * This option is typically used in conjunction with ZSTD_c_maxBlockSize.
    + *
    + * WARNING: This causes the decoder to reject otherwise valid frames
    + * that have block sizes larger than the configured maxBlockSize.
    + */
    +#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6
    +
     
     /*! ZSTD_DCtx_setFormat() :
      *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
    @@ -2557,7 +2610,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
      *       explicitly specified.
      *
      *  start a new frame, using same parameters from previous frame.
    - *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
    + *  This is typically useful to skip dictionary loading stage, since it will reuse it in-place.
      *  Note that zcs must be init at least once before using ZSTD_resetCStream().
      *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
      *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
    @@ -2633,7 +2686,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
      *
      *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
      *
    - * re-use decompression parameters from previous init; saves dictionary loading
    + * reuse decompression parameters from previous init; saves dictionary loading
      */
     ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions")
     ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
    @@ -2765,7 +2818,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
     
     #define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
     
    -typedef size_t ZSTD_sequenceProducer_F (
    +typedef size_t (*ZSTD_sequenceProducer_F) (
       void* sequenceProducerState,
       ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
       const void* src, size_t srcSize,
    @@ -2797,7 +2850,23 @@ ZSTDLIB_STATIC_API void
     ZSTD_registerSequenceProducer(
       ZSTD_CCtx* cctx,
       void* sequenceProducerState,
    -  ZSTD_sequenceProducer_F* sequenceProducer
    +  ZSTD_sequenceProducer_F sequenceProducer
    +);
    +
    +/*! ZSTD_CCtxParams_registerSequenceProducer() :
    + * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params.
    + * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(),
    + * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx().
    + *
    + * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx()
    + * is required, then this function is for you. Otherwise, you probably don't need it.
    + *
    + * See tests/zstreamtest.c for example usage. */
    +ZSTDLIB_STATIC_API void
    +ZSTD_CCtxParams_registerSequenceProducer(
    +  ZSTD_CCtx_params* params,
    +  void* sequenceProducerState,
    +  ZSTD_sequenceProducer_F sequenceProducer
     );
     
     
    @@ -2820,7 +2889,7 @@ ZSTD_registerSequenceProducer(
     
       A ZSTD_CCtx object is required to track streaming operations.
       Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
    -  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
    +  ZSTD_CCtx object can be reused multiple times within successive compression operations.
     
       Start by initializing a context.
       Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
    @@ -2841,7 +2910,7 @@ ZSTD_registerSequenceProducer(
       It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
       Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
     
    -  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
    +  `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again.
     */
     
     /*=====   Buffer-less streaming compression functions  =====*/
    @@ -2873,7 +2942,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
     
       A ZSTD_DCtx object is required to track streaming operations.
       Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
    -  A ZSTD_DCtx object can be re-used multiple times.
    +  A ZSTD_DCtx object can be reused multiple times.
     
       First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
       Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
    diff --git a/third-party/zstd/programs/.gitignore b/third-party/zstd/programs/.gitignore
    index 2d4edbe4..42a7e30d 100644
    --- a/third-party/zstd/programs/.gitignore
    +++ b/third-party/zstd/programs/.gitignore
    @@ -9,6 +9,8 @@ zstd-small
     zstd-nolegacy
     zstd-dictBuilder
     zstd-dll
    +zstd_arm64
    +zstd_x64
     
     # Object files
     *.o
    diff --git a/third-party/zstd/programs/Makefile b/third-party/zstd/programs/Makefile
    index 8507abef..4dcd8410 100644
    --- a/third-party/zstd/programs/Makefile
    +++ b/third-party/zstd/programs/Makefile
    @@ -15,12 +15,11 @@
     # zstd-decompress : decompressor-only version of zstd
     # ##########################################################################
     
    -.PHONY: default
    -default: zstd-release
    +# default target (when running `make` with no argument)
    +zstd-release:
     
    -LIBZSTD := ../lib
    -
    -include $(LIBZSTD)/libzstd.mk
    +LIBZSTD_MK_DIR = ../lib
    +include $(LIBZSTD_MK_DIR)/libzstd.mk
     
     ifeq ($(shell $(CC) -v 2>&1 | $(GREP) -c "gcc version "), 1)
       ALIGN_LOOP = -falign-loops=32
    @@ -223,7 +222,7 @@ zstd-noxz : zstd
     
     ## zstd-dll: zstd executable linked to dynamic library libzstd (must have same version)
     .PHONY: zstd-dll
    -zstd-dll : LDFLAGS+= -L$(LIBZSTD)
    +zstd-dll : LDFLAGS+= -L$(LIB_BINDIR)
     zstd-dll : LDLIBS += -lzstd
     zstd-dll : ZSTDLIB_LOCAL_SRC = xxhash.c pool.c threading.c
     zstd-dll : zstd
    @@ -346,7 +345,7 @@ include $(wildcard $(DEPFILES))
     #-----------------------------------------------------------------------------
     # make install is validated only for Linux, macOS, BSD, Hurd and Solaris targets
     #-----------------------------------------------------------------------------
    -ifneq (,$(filter $(UNAME),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku AIX))
    +ifneq (,$(filter $(UNAME),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku AIX MSYS_NT CYGWIN_NT))
     
     HAVE_COLORNEVER = $(shell echo a | egrep --color=never a > /dev/null 2> /dev/null && echo 1 || echo 0)
     EGREP_OPTIONS ?=
    diff --git a/third-party/zstd/programs/benchfn.c b/third-party/zstd/programs/benchfn.c
    index 8e6726f8..3e042cf3 100644
    --- a/third-party/zstd/programs/benchfn.c
    +++ b/third-party/zstd/programs/benchfn.c
    @@ -108,7 +108,6 @@ static BMK_runOutcome_t BMK_setValid_runTime(BMK_runTime_t runTime)
     BMK_runOutcome_t BMK_benchFunction(BMK_benchParams_t p,
                                        unsigned nbLoops)
     {
    -    size_t dstSize = 0;
         nbLoops += !nbLoops;   /* minimum nbLoops is 1 */
     
         /* init */
    @@ -118,7 +117,8 @@ BMK_runOutcome_t BMK_benchFunction(BMK_benchParams_t p,
         }   }
     
         /* benchmark */
    -    {   UTIL_time_t const clockStart = UTIL_getTime();
    +    {   size_t dstSize = 0;
    +        UTIL_time_t const clockStart = UTIL_getTime();
             unsigned loopNb, blockNb;
             if (p.initFn != NULL) p.initFn(p.initPayload);
             for (loopNb = 0; loopNb < nbLoops; loopNb++) {
    diff --git a/third-party/zstd/programs/benchzstd.c b/third-party/zstd/programs/benchzstd.c
    index 9bc3628e..29ee595c 100644
    --- a/third-party/zstd/programs/benchzstd.c
    +++ b/third-party/zstd/programs/benchzstd.c
    @@ -8,197 +8,286 @@
      * You may select, at your option, one of the above-listed licenses.
      */
     
    -
     /* **************************************
    -*  Tuning parameters
    -****************************************/
    -#ifndef BMK_TIMETEST_DEFAULT_S   /* default minimum time per test */
    -# define BMK_TIMETEST_DEFAULT_S 3
    + *  Tuning parameters
    + ****************************************/
    +#ifndef BMK_TIMETEST_DEFAULT_S /* default minimum time per test */
    +#    define BMK_TIMETEST_DEFAULT_S 3
     #endif
     
    -
     /* *************************************
    -*  Includes
    -***************************************/
    -#include "platform.h"    /* Large Files support */
    -#include "util.h"        /* UTIL_getFileSize, UTIL_sleep */
    -#include       /* malloc, free */
    -#include       /* memset, strerror */
    -#include        /* fprintf, fopen */
    -#include 
    -#include       /* assert */
    + *  Includes
    + ***************************************/
    +/* this must be included first */
    +#include "platform.h" /* Large Files support, compiler specifics */
     
    -#include "timefn.h"      /* UTIL_time_t */
    -#include "benchfn.h"
    +/* then following system includes */
    +#include  /* assert */
    +#include 
    +#include     /* fprintf, fopen */
    +#include    /* malloc, free */
    +#include    /* memset, strerror */
    +#include "util.h"     /* UTIL_getFileSize, UTIL_sleep */
     #include "../lib/common/mem.h"
    +#include "benchfn.h"
    +#include "timefn.h" /* UTIL_time_t */
     #ifndef ZSTD_STATIC_LINKING_ONLY
    -#define ZSTD_STATIC_LINKING_ONLY
    +#    define ZSTD_STATIC_LINKING_ONLY
     #endif
     #include "../lib/zstd.h"
    -#include "datagen.h"     /* RDG_genBuffer */
    +#include "datagen.h" /* RDG_genBuffer */
    +#include "lorem.h"   /* LOREM_genBuffer */
     #ifndef XXH_INLINE_ALL
    -#define XXH_INLINE_ALL
    +#    define XXH_INLINE_ALL
     #endif
     #include "../lib/common/xxhash.h"
    -#include "benchzstd.h"
     #include "../lib/zstd_errors.h"
    -
    +#include "benchzstd.h"
     
     /* *************************************
    -*  Constants
    -***************************************/
    + *  Constants
    + ***************************************/
     #ifndef ZSTD_GIT_COMMIT
    -#  define ZSTD_GIT_COMMIT_STRING ""
    +#    define ZSTD_GIT_COMMIT_STRING ""
     #else
    -#  define ZSTD_GIT_COMMIT_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_GIT_COMMIT)
    +#    define ZSTD_GIT_COMMIT_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_GIT_COMMIT)
     #endif
     
    -#define TIMELOOP_MICROSEC     (1*1000000ULL) /* 1 second */
    -#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
    -#define ACTIVEPERIOD_MICROSEC (70*TIMELOOP_MICROSEC) /* 70 seconds */
    -#define COOLPERIOD_SEC        10
    +#define TIMELOOP_MICROSEC (1 * 1000000ULL)             /* 1 second */
    +#define TIMELOOP_NANOSEC (1 * 1000000000ULL)           /* 1 second */
    +#define ACTIVEPERIOD_MICROSEC (70 * TIMELOOP_MICROSEC) /* 70 seconds */
    +#define COOLPERIOD_SEC 10
     
    -#define KB *(1 <<10)
    -#define MB *(1 <<20)
    -#define GB *(1U<<30)
    +#define KB *(1 << 10)
    +#define MB *(1 << 20)
    +#define GB *(1U << 30)
     
     #define BMK_RUNTEST_DEFAULT_MS 1000
     
    -static const size_t maxMemory = (sizeof(size_t)==4)  ?
    -                    /* 32-bit */ (2 GB - 64 MB) :
    -                    /* 64-bit */ (size_t)(1ULL << ((sizeof(size_t)*8)-31));
    -
    +static const size_t maxMemory = (sizeof(size_t) == 4)
    +        ?
    +        /* 32-bit */ (2 GB - 64 MB)
    +        :
    +        /* 64-bit */ (size_t)(1ULL << ((sizeof(size_t) * 8) - 31));
     
     /* *************************************
    -*  console display
    -***************************************/
    -#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush(NULL); }
    -#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
    -/* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
    -#define OUTPUT(...)          { fprintf(stdout, __VA_ARGS__); fflush(NULL); }
    -#define OUTPUTLEVEL(l, ...)  if (displayLevel>=l) { OUTPUT(__VA_ARGS__); }
    -
    + *  console display
    + ***************************************/
    +#define DISPLAY(...)                  \
    +    {                                 \
    +        fprintf(stderr, __VA_ARGS__); \
    +        fflush(NULL);                 \
    +    }
    +#define DISPLAYLEVEL(l, ...)  \
    +    if (displayLevel >= l) {  \
    +        DISPLAY(__VA_ARGS__); \
    +    }
    +/* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : +
    + * progression;   4 : + information */
    +#define OUTPUT(...)                   \
    +    {                                 \
    +        fprintf(stdout, __VA_ARGS__); \
    +        fflush(NULL);                 \
    +    }
    +#define OUTPUTLEVEL(l, ...)  \
    +    if (displayLevel >= l) { \
    +        OUTPUT(__VA_ARGS__); \
    +    }
     
     /* *************************************
    -*  Exceptions
    -***************************************/
    + *  Exceptions
    + ***************************************/
     #ifndef DEBUG
    -#  define DEBUG 0
    +#    define DEBUG 0
     #endif
    -#define DEBUGOUTPUT(...) { if (DEBUG) DISPLAY(__VA_ARGS__); }
    -
    -#define RETURN_ERROR_INT(errorNum, ...)  {               \
    -    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
    -    DISPLAYLEVEL(1, "Error %i : ", errorNum);         \
    -    DISPLAYLEVEL(1, __VA_ARGS__);                     \
    -    DISPLAYLEVEL(1, " \n");                           \
    -    return errorNum;                                  \
    -}
    +#define DEBUGOUTPUT(...)          \
    +    {                             \
    +        if (DEBUG)                \
    +            DISPLAY(__VA_ARGS__); \
    +    }
     
    -#define CHECK_Z(zf) {              \
    -    size_t const zerr = zf;        \
    -    if (ZSTD_isError(zerr)) {      \
    -        DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);  \
    -        DISPLAY("Error : ");       \
    -        DISPLAY("%s failed : %s",  \
    -                #zf, ZSTD_getErrorName(zerr));   \
    -        DISPLAY(" \n");            \
    -        exit(1);                   \
    -    }                              \
    -}
    +#define RETURN_ERROR_INT(errorNum, ...)                \
    +    {                                                  \
    +        DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__); \
    +        DISPLAYLEVEL(1, "Error %i : ", errorNum);      \
    +        DISPLAYLEVEL(1, __VA_ARGS__);                  \
    +        DISPLAYLEVEL(1, " \n");                        \
    +        return errorNum;                               \
    +    }
     
    -#define RETURN_ERROR(errorNum, retType, ...)  {       \
    -    retType r;                                        \
    -    memset(&r, 0, sizeof(retType));                   \
    -    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
    -    DISPLAYLEVEL(1, "Error %i : ", errorNum);         \
    -    DISPLAYLEVEL(1, __VA_ARGS__);                     \
    -    DISPLAYLEVEL(1, " \n");                           \
    -    r.tag = errorNum;                                 \
    -    return r;                                         \
    -}
    +#define CHECK_Z(zf)                                                  \
    +    {                                                                \
    +        size_t const zerr = zf;                                      \
    +        if (ZSTD_isError(zerr)) {                                    \
    +            DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);           \
    +            DISPLAY("Error : ");                                     \
    +            DISPLAY("%s failed : %s", #zf, ZSTD_getErrorName(zerr)); \
    +            DISPLAY(" \n");                                          \
    +            exit(1);                                                 \
    +        }                                                            \
    +    }
    +
    +#define RETURN_ERROR(errorNum, retType, ...)           \
    +    {                                                  \
    +        retType r;                                     \
    +        memset(&r, 0, sizeof(retType));                \
    +        DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__); \
    +        DISPLAYLEVEL(1, "Error %i : ", errorNum);      \
    +        DISPLAYLEVEL(1, __VA_ARGS__);                  \
    +        DISPLAYLEVEL(1, " \n");                        \
    +        r.tag = errorNum;                              \
    +        return r;                                      \
    +    }
    +
    +/* replacement for snprintf(), which is not supported by C89
    + * sprintf() would be the supported one, but it's labelled unsafe,
    + * so some modern static analyzer will flag it as such, making it unusable.
    + * formatString_u() replaces snprintf() for the specific case where there are only %u arguments */
    +static int formatString_u(char* buffer, size_t buffer_size, const char* formatString, unsigned int value)
    +{
    +    size_t written = 0;
    +    int i;
    +    assert(value <= 100);
     
    +    for (i = 0; formatString[i] != '\0' && written < buffer_size - 1; ++i) {
    +        if (formatString[i] != '%') {
    +            buffer[written++] = formatString[i];
    +            continue;
    +        }
    +
    +        if (formatString[++i] == 'u') {
    +            /* Handle single digit */
    +            if (value < 10) {
    +                buffer[written++] = '0' + (char)value;
    +            } else if (value < 100) {
    +                /* Handle two digits */
    +                if (written >= buffer_size - 2) {
    +                    return -1; /* buffer overflow */
    +                }
    +                buffer[written++] = '0' + (char)(value / 10);
    +                buffer[written++] = '0' + (char)(value % 10);
    +            } else { /* 100 */
    +                if (written >= buffer_size - 3) {
    +                    return -1; /* buffer overflow */
    +                }
    +                buffer[written++] = '1';
    +                buffer[written++] = '0';
    +                buffer[written++] = '0';
    +            }
    +        } else if (formatString[i] == '%') { /* Check for escaped percent sign */
    +            buffer[written++] = '%';
    +        } else {
    +            return -1; /* unsupported format */
    +        }
    +    }
    +
    +    if (written < buffer_size) {
    +        buffer[written] = '\0';
    +    } else {
    +        buffer[0] = '\0'; /* Handle truncation */
    +    }
    +
    +    return (int)written;
    +}
     
     /* *************************************
    -*  Benchmark Parameters
    -***************************************/
    + *  Benchmark Parameters
    + ***************************************/
     
    -BMK_advancedParams_t BMK_initAdvancedParams(void) {
    +BMK_advancedParams_t BMK_initAdvancedParams(void)
    +{
         BMK_advancedParams_t const res = {
    -        BMK_both, /* mode */
    +        BMK_both,               /* mode */
             BMK_TIMETEST_DEFAULT_S, /* nbSeconds */
    -        0, /* blockSize */
    -        0, /* nbWorkers */
    -        0, /* realTime */
    -        0, /* additionalParam */
    -        0, /* ldmFlag */
    -        0, /* ldmMinMatch */
    -        0, /* ldmHashLog */
    -        0, /* ldmBuckSizeLog */
    -        0,  /* ldmHashRateLog */
    -        ZSTD_ps_auto, /* literalCompressionMode */
    -        0 /* useRowMatchFinder */
    +        0,                      /* blockSize */
    +        0,               /* targetCBlockSize */
    +        0,                      /* nbWorkers */
    +        0,                      /* realTime */
    +        0,                      /* additionalParam */
    +        0,                      /* ldmFlag */
    +        0,                      /* ldmMinMatch */
    +        0,                      /* ldmHashLog */
    +        0,                      /* ldmBuckSizeLog */
    +        0,                      /* ldmHashRateLog */
    +        ZSTD_ps_auto,           /* literalCompressionMode */
    +        0                       /* useRowMatchFinder */
         };
         return res;
     }
     
    -
     /* ********************************************************
    -*  Bench functions
    -**********************************************************/
    + *  Bench functions
    + **********************************************************/
     typedef struct {
         const void* srcPtr;
         size_t srcSize;
    -    void*  cPtr;
    +    void* cPtr;
         size_t cRoom;
         size_t cSize;
    -    void*  resPtr;
    +    void* resPtr;
         size_t resSize;
     } blockParam_t;
     
     #undef MIN
     #undef MAX
    -#define MIN(a,b)    ((a) < (b) ? (a) : (b))
    -#define MAX(a,b)    ((a) > (b) ? (a) : (b))
    -
    -static void
    -BMK_initCCtx(ZSTD_CCtx* ctx,
    -            const void* dictBuffer, size_t dictBufferSize,
    -            int cLevel,
    -            const ZSTD_compressionParameters* comprParams,
    -            const BMK_advancedParams_t* adv)
    +#define MIN(a, b) ((a) < (b) ? (a) : (b))
    +#define MAX(a, b) ((a) > (b) ? (a) : (b))
    +
    +static void BMK_initCCtx(
    +        ZSTD_CCtx* ctx,
    +        const void* dictBuffer,
    +        size_t dictBufferSize,
    +        int cLevel,
    +        const ZSTD_compressionParameters* comprParams,
    +        const BMK_advancedParams_t* adv)
     {
         ZSTD_CCtx_reset(ctx, ZSTD_reset_session_and_parameters);
    -    if (adv->nbWorkers==1) {
    +    if (adv->nbWorkers == 1) {
             CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_nbWorkers, 0));
         } else {
             CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_nbWorkers, adv->nbWorkers));
         }
         CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_compressionLevel, cLevel));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_useRowMatchFinder, adv->useRowMatchFinder));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_enableLongDistanceMatching, adv->ldmFlag));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_useRowMatchFinder, adv->useRowMatchFinder));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_enableLongDistanceMatching, adv->ldmFlag));
         CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmMinMatch, adv->ldmMinMatch));
         CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmHashLog, adv->ldmHashLog));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmBucketSizeLog, adv->ldmBucketSizeLog));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmHashRateLog, adv->ldmHashRateLog));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_windowLog, (int)comprParams->windowLog));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_hashLog, (int)comprParams->hashLog));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_chainLog, (int)comprParams->chainLog));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_searchLog, (int)comprParams->searchLog));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_minMatch, (int)comprParams->minMatch));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_targetLength, (int)comprParams->targetLength));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_literalCompressionMode, (int)adv->literalCompressionMode));
    -    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_strategy, (int)comprParams->strategy));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_ldmBucketSizeLog, adv->ldmBucketSizeLog));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_ldmHashRateLog, adv->ldmHashRateLog));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_windowLog, (int)comprParams->windowLog));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_hashLog, (int)comprParams->hashLog));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_chainLog, (int)comprParams->chainLog));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_searchLog, (int)comprParams->searchLog));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_minMatch, (int)comprParams->minMatch));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_targetLength, (int)comprParams->targetLength));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx,
    +            ZSTD_c_literalCompressionMode,
    +            (int)adv->literalCompressionMode));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_strategy, (int)comprParams->strategy));
    +    CHECK_Z(ZSTD_CCtx_setParameter(
    +            ctx, ZSTD_c_targetCBlockSize, (int)adv->targetCBlockSize));
         CHECK_Z(ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize));
     }
     
    -static void BMK_initDCtx(ZSTD_DCtx* dctx,
    -    const void* dictBuffer, size_t dictBufferSize) {
    +static void
    +BMK_initDCtx(ZSTD_DCtx* dctx, const void* dictBuffer, size_t dictBufferSize)
    +{
         CHECK_Z(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters));
         CHECK_Z(ZSTD_DCtx_loadDictionary(dctx, dictBuffer, dictBufferSize));
     }
     
    -
     typedef struct {
         ZSTD_CCtx* cctx;
         const void* dictBuffer;
    @@ -208,9 +297,16 @@ typedef struct {
         const BMK_advancedParams_t* adv;
     } BMK_initCCtxArgs;
     
    -static size_t local_initCCtx(void* payload) {
    +static size_t local_initCCtx(void* payload)
    +{
         BMK_initCCtxArgs* ag = (BMK_initCCtxArgs*)payload;
    -    BMK_initCCtx(ag->cctx, ag->dictBuffer, ag->dictBufferSize, ag->cLevel, ag->comprParams, ag->adv);
    +    BMK_initCCtx(
    +            ag->cctx,
    +            ag->dictBuffer,
    +            ag->dictBufferSize,
    +            ag->cLevel,
    +            ag->comprParams,
    +            ag->adv);
         return 0;
     }
     
    @@ -220,18 +316,20 @@ typedef struct {
         size_t dictBufferSize;
     } BMK_initDCtxArgs;
     
    -static size_t local_initDCtx(void* payload) {
    +static size_t local_initDCtx(void* payload)
    +{
         BMK_initDCtxArgs* ag = (BMK_initDCtxArgs*)payload;
         BMK_initDCtx(ag->dctx, ag->dictBuffer, ag->dictBufferSize);
         return 0;
     }
     
    -
     /* `addArgs` is the context */
     static size_t local_defaultCompress(
    -                    const void* srcBuffer, size_t srcSize,
    -                    void* dstBuffer, size_t dstSize,
    -                    void* addArgs)
    +        const void* srcBuffer,
    +        size_t srcSize,
    +        void* dstBuffer,
    +        size_t dstSize,
    +        void* addArgs)
     {
         ZSTD_CCtx* const cctx = (ZSTD_CCtx*)addArgs;
         return ZSTD_compress2(cctx, dstBuffer, dstSize, srcBuffer, srcSize);
    @@ -239,18 +337,24 @@ static size_t local_defaultCompress(
     
     /* `addArgs` is the context */
     static size_t local_defaultDecompress(
    -                    const void* srcBuffer, size_t srcSize,
    -                    void* dstBuffer, size_t dstCapacity,
    -                    void* addArgs)
    +        const void* srcBuffer,
    +        size_t srcSize,
    +        void* dstBuffer,
    +        size_t dstCapacity,
    +        void* addArgs)
     {
    -    size_t moreToFlush = 1;
    +    size_t moreToFlush    = 1;
         ZSTD_DCtx* const dctx = (ZSTD_DCtx*)addArgs;
         ZSTD_inBuffer in;
         ZSTD_outBuffer out;
    -    in.src = srcBuffer; in.size = srcSize; in.pos = 0;
    -    out.dst = dstBuffer; out.size = dstCapacity; out.pos = 0;
    +    in.src   = srcBuffer;
    +    in.size  = srcSize;
    +    in.pos   = 0;
    +    out.dst  = dstBuffer;
    +    out.size = dstCapacity;
    +    out.pos  = 0;
         while (moreToFlush) {
    -        if(out.pos == out.size) {
    +        if (out.pos == out.size) {
                 return (size_t)-ZSTD_error_dstSize_tooSmall;
             }
             moreToFlush = ZSTD_decompressStream(dctx, &out, &in);
    @@ -259,10 +363,8 @@ static size_t local_defaultDecompress(
             }
         }
         return out.pos;
    -
     }
     
    -
     /* ================================================================= */
     /*      Benchmark Zstandard, mem-to-mem scenarios                    */
     /* ================================================================= */
    @@ -286,104 +388,145 @@ static BMK_benchOutcome_t BMK_benchOutcome_error(void)
         return b;
     }
     
    -static BMK_benchOutcome_t BMK_benchOutcome_setValidResult(BMK_benchResult_t result)
    +static BMK_benchOutcome_t BMK_benchOutcome_setValidResult(
    +        BMK_benchResult_t result)
     {
         BMK_benchOutcome_t b;
    -    b.tag = 0;
    +    b.tag                         = 0;
         b.internal_never_use_directly = result;
         return b;
     }
     
    -
     /* benchMem with no allocation */
    -static BMK_benchOutcome_t
    -BMK_benchMemAdvancedNoAlloc(
    -                    const void** srcPtrs, size_t* srcSizes,
    -                    void** cPtrs, size_t* cCapacities, size_t* cSizes,
    -                    void** resPtrs, size_t* resSizes,
    -                    void** resultBufferPtr, void* compressedBuffer,
    -                    size_t maxCompressedSize,
    -                    BMK_timedFnState_t* timeStateCompress,
    -                    BMK_timedFnState_t* timeStateDecompress,
    -
    -                    const void* srcBuffer, size_t srcSize,
    -                    const size_t* fileSizes, unsigned nbFiles,
    -                    const int cLevel,
    -                    const ZSTD_compressionParameters* comprParams,
    -                    const void* dictBuffer, size_t dictBufferSize,
    -                    ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
    -                    int displayLevel, const char* displayName,
    -                    const BMK_advancedParams_t* adv)
    +static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
    +        const void** srcPtrs,
    +        size_t* srcSizes,
    +        void** cPtrs,
    +        size_t* cCapacities,
    +        size_t* cSizes,
    +        void** resPtrs,
    +        size_t* resSizes,
    +        void** resultBufferPtr,
    +        void* compressedBuffer,
    +        size_t maxCompressedSize,
    +        BMK_timedFnState_t* timeStateCompress,
    +        BMK_timedFnState_t* timeStateDecompress,
    +
    +        const void* srcBuffer,
    +        size_t srcSize,
    +        const size_t* fileSizes,
    +        unsigned nbFiles,
    +        const int cLevel,
    +        const ZSTD_compressionParameters* comprParams,
    +        const void* dictBuffer,
    +        size_t dictBufferSize,
    +        ZSTD_CCtx* cctx,
    +        ZSTD_DCtx* dctx,
    +        int displayLevel,
    +        const char* displayName,
    +        const BMK_advancedParams_t* adv)
     {
    -    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize);  /* avoid div by 0 */
    +    size_t const blockSize =
    +            ((adv->blockSize >= 32 && (adv->mode != BMK_decodeOnly))
    +                     ? adv->blockSize
    +                     : srcSize)
    +            + (!srcSize); /* avoid div by 0 */
         BMK_benchResult_t benchResult;
         size_t const loadedCompressedSize = srcSize;
    -    size_t cSize = 0;
    -    double ratio = 0.;
    +    size_t cSize                      = 0;
    +    double ratio                      = 0.;
         U32 nbBlocks;
     
    -    assert(cctx != NULL); assert(dctx != NULL);
    +    assert(cctx != NULL);
    +    assert(dctx != NULL);
     
         /* init */
         memset(&benchResult, 0, sizeof(benchResult));
    -    if (strlen(displayName)>17) displayName += strlen(displayName) - 17;   /* display last 17 characters */
    +    if (strlen(displayName) > 17)
    +        displayName +=
    +                strlen(displayName) - 17; /* display last 17 characters */
         if (adv->mode == BMK_decodeOnly) {
             /* benchmark only decompression : source must be already compressed */
             const char* srcPtr = (const char*)srcBuffer;
    -        U64 totalDSize64 = 0;
    +        U64 totalDSize64   = 0;
             U32 fileNb;
    -        for (fileNb=0; fileNb decodedSize) {  /* size_t overflow */
    -                RETURN_ERROR(32, BMK_benchOutcome_t, "decompressed size is too large for local system");
    +            if (totalDSize64 > decodedSize) { /* size_t overflow */
    +                RETURN_ERROR(
    +                        32,
    +                        BMK_benchOutcome_t,
    +                        "decompressed size is too large for local system");
                 }
                 *resultBufferPtr = malloc(decodedSize);
                 if (!(*resultBufferPtr)) {
    -                RETURN_ERROR(33, BMK_benchOutcome_t, "allocation error: not enough memory");
    +                RETURN_ERROR(
    +                        33,
    +                        BMK_benchOutcome_t,
    +                        "allocation error: not enough memory");
                 }
    -            cSize = srcSize;
    +            cSize   = srcSize;
                 srcSize = decodedSize;
    -            ratio = (double)srcSize / (double)cSize;
    +            ratio   = (double)srcSize / (double)cSize;
             }
         }
     
         /* Init data blocks  */
    -    {   const char* srcPtr = (const char*)srcBuffer;
    -        char* cPtr = (char*)compressedBuffer;
    -        char* resPtr = (char*)(*resultBufferPtr);
    +    {
    +        const char* srcPtr = (const char*)srcBuffer;
    +        char* cPtr         = (char*)compressedBuffer;
    +        char* resPtr       = (char*)(*resultBufferPtr);
             U32 fileNb;
    -        for (nbBlocks=0, fileNb=0; fileNbmode == BMK_decodeOnly) ? 1 : (U32)((remaining + (blockSize-1)) / blockSize);
    -            U32 const blockEnd = nbBlocks + nbBlocksforThisFile;
    -            for ( ; nbBlocksmode == BMK_decodeOnly)
    +                    ? 1
    +                    : (U32)((remaining + (blockSize - 1)) / blockSize);
    +            U32 const blockEnd            = nbBlocks + nbBlocksforThisFile;
    +            for (; nbBlocks < blockEnd; nbBlocks++) {
                     size_t const thisBlockSize = MIN(remaining, blockSize);
    -                srcPtrs[nbBlocks] = srcPtr;
    -                srcSizes[nbBlocks] = thisBlockSize;
    -                cPtrs[nbBlocks] = cPtr;
    -                cCapacities[nbBlocks] = (adv->mode == BMK_decodeOnly) ? thisBlockSize : ZSTD_compressBound(thisBlockSize);
    -                resPtrs[nbBlocks] = resPtr;
    -                resSizes[nbBlocks] = (adv->mode == BMK_decodeOnly) ? (size_t) ZSTD_findDecompressedSize(srcPtr, thisBlockSize) : thisBlockSize;
    +                srcPtrs[nbBlocks]          = srcPtr;
    +                srcSizes[nbBlocks]         = thisBlockSize;
    +                cPtrs[nbBlocks]            = cPtr;
    +                cCapacities[nbBlocks]      = (adv->mode == BMK_decodeOnly)
    +                             ? thisBlockSize
    +                             : ZSTD_compressBound(thisBlockSize);
    +                resPtrs[nbBlocks]          = resPtr;
    +                resSizes[nbBlocks]         = (adv->mode == BMK_decodeOnly)
    +                                ? (size_t)ZSTD_findDecompressedSize(
    +                                srcPtr, thisBlockSize)
    +                                : thisBlockSize;
                     srcPtr += thisBlockSize;
                     cPtr += cCapacities[nbBlocks];
                     resPtr += thisBlockSize;
                     remaining -= thisBlockSize;
                     if (adv->mode == BMK_decodeOnly) {
    -                    cSizes[nbBlocks] = thisBlockSize;
    +                    cSizes[nbBlocks]  = thisBlockSize;
                         benchResult.cSize = thisBlockSize;
    -    }   }   }   }
    +                }
    +            }
    +        }
    +    }
     
         /* warming up `compressedBuffer` */
         if (adv->mode == BMK_decodeOnly) {
    @@ -393,236 +536,329 @@ BMK_benchMemAdvancedNoAlloc(
         }
     
         if (!UTIL_support_MT_measurements() && adv->nbWorkers > 1) {
    -        OUTPUTLEVEL(2, "Warning : time measurements may be incorrect in multithreading mode... \n")
    +        OUTPUTLEVEL(
    +                2,
    +                "Warning : time measurements may be incorrect in multithreading mode... \n")
         }
     
         /* Bench */
    -    {   U64 const crcOrig = (adv->mode == BMK_decodeOnly) ? 0 : XXH64(srcBuffer, srcSize, 0);
    -#       define NB_MARKS 4
    +    {
    +        U64 const crcOrig = (adv->mode == BMK_decodeOnly)
    +                ? 0
    +                : XXH64(srcBuffer, srcSize, 0);
    +#define NB_MARKS 4
             const char* marks[NB_MARKS] = { " |", " /", " =", " \\" };
    -        U32 markNb = 0;
    -        int compressionCompleted = (adv->mode == BMK_decodeOnly);
    -        int decompressionCompleted = (adv->mode == BMK_compressOnly);
    +        U32 markNb                  = 0;
    +        int compressionCompleted    = (adv->mode == BMK_decodeOnly);
    +        int decompressionCompleted  = (adv->mode == BMK_compressOnly);
             BMK_benchParams_t cbp, dbp;
             BMK_initCCtxArgs cctxprep;
             BMK_initDCtxArgs dctxprep;
     
    -        cbp.benchFn = local_defaultCompress;   /* ZSTD_compress2 */
    -        cbp.benchPayload = cctx;
    -        cbp.initFn = local_initCCtx;   /* BMK_initCCtx */
    -        cbp.initPayload = &cctxprep;
    -        cbp.errorFn = ZSTD_isError;
    -        cbp.blockCount = nbBlocks;
    -        cbp.srcBuffers = srcPtrs;
    -        cbp.srcSizes = srcSizes;
    -        cbp.dstBuffers = cPtrs;
    +        cbp.benchFn       = local_defaultCompress; /* ZSTD_compress2 */
    +        cbp.benchPayload  = cctx;
    +        cbp.initFn        = local_initCCtx; /* BMK_initCCtx */
    +        cbp.initPayload   = &cctxprep;
    +        cbp.errorFn       = ZSTD_isError;
    +        cbp.blockCount    = nbBlocks;
    +        cbp.srcBuffers    = srcPtrs;
    +        cbp.srcSizes      = srcSizes;
    +        cbp.dstBuffers    = cPtrs;
             cbp.dstCapacities = cCapacities;
    -        cbp.blockResults = cSizes;
    +        cbp.blockResults  = cSizes;
     
    -        cctxprep.cctx = cctx;
    -        cctxprep.dictBuffer = dictBuffer;
    +        cctxprep.cctx           = cctx;
    +        cctxprep.dictBuffer     = dictBuffer;
             cctxprep.dictBufferSize = dictBufferSize;
    -        cctxprep.cLevel = cLevel;
    -        cctxprep.comprParams = comprParams;
    -        cctxprep.adv = adv;
    -
    -        dbp.benchFn = local_defaultDecompress;
    -        dbp.benchPayload = dctx;
    -        dbp.initFn = local_initDCtx;
    -        dbp.initPayload = &dctxprep;
    -        dbp.errorFn = ZSTD_isError;
    -        dbp.blockCount = nbBlocks;
    -        dbp.srcBuffers = (const void* const *) cPtrs;
    -        dbp.srcSizes = cSizes;
    -        dbp.dstBuffers = resPtrs;
    +        cctxprep.cLevel         = cLevel;
    +        cctxprep.comprParams    = comprParams;
    +        cctxprep.adv            = adv;
    +
    +        dbp.benchFn       = local_defaultDecompress;
    +        dbp.benchPayload  = dctx;
    +        dbp.initFn        = local_initDCtx;
    +        dbp.initPayload   = &dctxprep;
    +        dbp.errorFn       = ZSTD_isError;
    +        dbp.blockCount    = nbBlocks;
    +        dbp.srcBuffers    = (const void* const*)cPtrs;
    +        dbp.srcSizes      = cSizes;
    +        dbp.dstBuffers    = resPtrs;
             dbp.dstCapacities = resSizes;
    -        dbp.blockResults = NULL;
    +        dbp.blockResults  = NULL;
     
    -        dctxprep.dctx = dctx;
    -        dctxprep.dictBuffer = dictBuffer;
    +        dctxprep.dctx           = dctx;
    +        dctxprep.dictBuffer     = dictBuffer;
             dctxprep.dictBufferSize = dictBufferSize;
     
    -        OUTPUTLEVEL(2, "\r%70s\r", "");   /* blank line */
    +        OUTPUTLEVEL(2, "\r%70s\r", ""); /* blank line */
             assert(srcSize < UINT_MAX);
    -        OUTPUTLEVEL(2, "%2s-%-17.17s :%10u -> \r", marks[markNb], displayName, (unsigned)srcSize);
    +        OUTPUTLEVEL(
    +                2,
    +                "%2s-%-17.17s :%10u -> \r",
    +                marks[markNb],
    +                displayName,
    +                (unsigned)srcSize);
     
             while (!(compressionCompleted && decompressionCompleted)) {
                 if (!compressionCompleted) {
    -                BMK_runOutcome_t const cOutcome = BMK_benchTimedFn( timeStateCompress, cbp);
    +                BMK_runOutcome_t const cOutcome =
    +                        BMK_benchTimedFn(timeStateCompress, cbp);
     
                     if (!BMK_isSuccessful_runOutcome(cOutcome)) {
                         RETURN_ERROR(30, BMK_benchOutcome_t, "compression error");
                     }
     
    -                {   BMK_runTime_t const cResult = BMK_extract_runTime(cOutcome);
    -                    cSize = cResult.sumOfReturn;
    +                {
    +                    BMK_runTime_t const cResult = BMK_extract_runTime(cOutcome);
    +                    cSize                       = cResult.sumOfReturn;
                         ratio = (double)srcSize / (double)cSize;
    -                    {   BMK_benchResult_t newResult;
    -                        newResult.cSpeed = (U64)((double)srcSize * TIMELOOP_NANOSEC / cResult.nanoSecPerRun);
    +                    {
    +                        BMK_benchResult_t newResult;
    +                        newResult.cSpeed =
    +                                (U64)((double)srcSize * TIMELOOP_NANOSEC
    +                                      / cResult.nanoSecPerRun);
                             benchResult.cSize = cSize;
                             if (newResult.cSpeed > benchResult.cSpeed)
                                 benchResult.cSpeed = newResult.cSpeed;
    -                }   }
    +                    }
    +                }
     
    -                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
    +                {
    +                    int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
                         assert(cSize < UINT_MAX);
    -                    OUTPUTLEVEL(2, "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s \r",
    -                            marks[markNb], displayName,
    -                            (unsigned)srcSize, (unsigned)cSize,
    -                            ratioAccuracy, ratio,
    -                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT);
    +                    OUTPUTLEVEL(
    +                            2,
    +                            "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s \r",
    +                            marks[markNb],
    +                            displayName,
    +                            (unsigned)srcSize,
    +                            (unsigned)cSize,
    +                            ratioAccuracy,
    +                            ratio,
    +                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1,
    +                            (double)benchResult.cSpeed / MB_UNIT);
                     }
    -                compressionCompleted = BMK_isCompleted_TimedFn(timeStateCompress);
    +                compressionCompleted =
    +                        BMK_isCompleted_TimedFn(timeStateCompress);
                 }
     
    -            if(!decompressionCompleted) {
    -                BMK_runOutcome_t const dOutcome = BMK_benchTimedFn(timeStateDecompress, dbp);
    +            if (!decompressionCompleted) {
    +                BMK_runOutcome_t const dOutcome =
    +                        BMK_benchTimedFn(timeStateDecompress, dbp);
     
    -                if(!BMK_isSuccessful_runOutcome(dOutcome)) {
    +                if (!BMK_isSuccessful_runOutcome(dOutcome)) {
                         RETURN_ERROR(30, BMK_benchOutcome_t, "decompression error");
                     }
     
    -                {   BMK_runTime_t const dResult = BMK_extract_runTime(dOutcome);
    -                    U64 const newDSpeed = (U64)((double)srcSize * TIMELOOP_NANOSEC / dResult.nanoSecPerRun);
    +                {
    +                    BMK_runTime_t const dResult = BMK_extract_runTime(dOutcome);
    +                    U64 const newDSpeed =
    +                            (U64)((double)srcSize * TIMELOOP_NANOSEC
    +                                  / dResult.nanoSecPerRun);
                         if (newDSpeed > benchResult.dSpeed)
                             benchResult.dSpeed = newDSpeed;
                     }
     
    -                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
    -                    OUTPUTLEVEL(2, "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s, %6.1f MB/s\r",
    -                            marks[markNb], displayName,
    -                            (unsigned)srcSize, (unsigned)cSize,
    -                            ratioAccuracy, ratio,
    -                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT,
    +                {
    +                    int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
    +                    OUTPUTLEVEL(
    +                            2,
    +                            "%2s-%-17.17s :%10u ->%10u (x%5.*f), %6.*f MB/s, %6.1f MB/s\r",
    +                            marks[markNb],
    +                            displayName,
    +                            (unsigned)srcSize,
    +                            (unsigned)cSize,
    +                            ratioAccuracy,
    +                            ratio,
    +                            benchResult.cSpeed < (10 * MB_UNIT) ? 2 : 1,
    +                            (double)benchResult.cSpeed / MB_UNIT,
                                 (double)benchResult.dSpeed / MB_UNIT);
                     }
    -                decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
    +                decompressionCompleted =
    +                        BMK_isCompleted_TimedFn(timeStateDecompress);
                 }
    -            markNb = (markNb+1) % NB_MARKS;
    -        }   /* while (!(compressionCompleted && decompressionCompleted)) */
    +            markNb = (markNb + 1) % NB_MARKS;
    +        } /* while (!(compressionCompleted && decompressionCompleted)) */
     
             /* CRC Checking */
    -        {   const BYTE* resultBuffer = (const BYTE*)(*resultBufferPtr);
    -            U64 const crcCheck = XXH64(resultBuffer, srcSize, 0);
    -            if ((adv->mode == BMK_both) && (crcOrig!=crcCheck)) {
    +        {
    +            const BYTE* resultBuffer = (const BYTE*)(*resultBufferPtr);
    +            U64 const crcCheck       = XXH64(resultBuffer, srcSize, 0);
    +            if ((adv->mode == BMK_both) && (crcOrig != crcCheck)) {
                     size_t u;
                     DISPLAY("!!! WARNING !!! %14s : Invalid Checksum : %x != %x   \n",
    -                        displayName, (unsigned)crcOrig, (unsigned)crcCheck);
    -                for (u=0; u u) break;
    +                            if (bacc + srcSizes[segNb] > u)
    +                                break;
                                 bacc += srcSizes[segNb];
                             }
                             pos = (U32)(u - bacc);
                             bNb = pos / (128 KB);
    -                        DISPLAY("(sample %u, block %u, pos %u) \n", segNb, bNb, pos);
    -                        {   size_t const lowest = (u>5) ? 5 : u;
    +                        DISPLAY("(sample %u, block %u, pos %u) \n",
    +                                segNb,
    +                                bNb,
    +                                pos);
    +                        {
    +                            size_t const lowest = (u > 5) ? 5 : u;
                                 size_t n;
                                 DISPLAY("origin: ");
    -                            for (n=lowest; n>0; n--)
    -                                DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u-n]);
    +                            for (n = lowest; n > 0; n--)
    +                                DISPLAY("%02X ",
    +                                        ((const BYTE*)srcBuffer)[u - n]);
                                 DISPLAY(" :%02X:  ", ((const BYTE*)srcBuffer)[u]);
    -                            for (n=1; n<3; n++)
    -                                DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
    +                            for (n = 1; n < 3; n++)
    +                                DISPLAY("%02X ",
    +                                        ((const BYTE*)srcBuffer)[u + n]);
                                 DISPLAY(" \n");
                                 DISPLAY("decode: ");
    -                            for (n=lowest; n>0; n--)
    -                                DISPLAY("%02X ", resultBuffer[u-n]);
    +                            for (n = lowest; n > 0; n--)
    +                                DISPLAY("%02X ", resultBuffer[u - n]);
                                 DISPLAY(" :%02X:  ", resultBuffer[u]);
    -                            for (n=1; n<3; n++)
    -                                DISPLAY("%02X ", resultBuffer[u+n]);
    +                            for (n = 1; n < 3; n++)
    +                                DISPLAY("%02X ", resultBuffer[u + n]);
                                 DISPLAY(" \n");
                             }
                             break;
                         }
    -                    if (u==srcSize-1) {  /* should never happen */
    +                    if (u == srcSize - 1) { /* should never happen */
                             DISPLAY("no difference detected\n");
                         }
    -                }   /* for (u=0; umode == BMK_both) && (crcOrig!=crcCheck)) */
    -        }   /* CRC Checking */
    +                } /* for (u=0; umode == BMK_both) && (crcOrig!=crcCheck)) */
    +        }         /* CRC Checking */
     
    -        if (displayLevel == 1) {   /* hidden display mode -q, used by python speed benchmark */
    +        if (displayLevel
    +            == 1) { /* hidden display mode -q, used by python speed benchmark */
                 double const cSpeed = (double)benchResult.cSpeed / MB_UNIT;
                 double const dSpeed = (double)benchResult.dSpeed / MB_UNIT;
                 if (adv->additionalParam) {
    -                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, adv->additionalParam);
    +                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n",
    +                       cLevel,
    +                       (int)cSize,
    +                       ratio,
    +                       cSpeed,
    +                       dSpeed,
    +                       displayName,
    +                       adv->additionalParam);
                 } else {
    -                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName);
    +                OUTPUT("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n",
    +                       cLevel,
    +                       (int)cSize,
    +                       ratio,
    +                       cSpeed,
    +                       dSpeed,
    +                       displayName);
                 }
             }
     
             OUTPUTLEVEL(2, "%2i#\n", cLevel);
    -    }   /* Bench */
    +    } /* Bench */
     
    -    benchResult.cMem = (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(cctx);
    +    benchResult.cMem =
    +            (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(cctx);
         return BMK_benchOutcome_setValidResult(benchResult);
     }
     
    -BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
    -                        void* dstBuffer, size_t dstCapacity,
    -                        const size_t* fileSizes, unsigned nbFiles,
    -                        int cLevel, const ZSTD_compressionParameters* comprParams,
    -                        const void* dictBuffer, size_t dictBufferSize,
    -                        int displayLevel, const char* displayName, const BMK_advancedParams_t* adv)
    +BMK_benchOutcome_t BMK_benchMemAdvanced(
    +        const void* srcBuffer,
    +        size_t srcSize,
    +        void* dstBuffer,
    +        size_t dstCapacity,
    +        const size_t* fileSizes,
    +        unsigned nbFiles,
    +        int cLevel,
    +        const ZSTD_compressionParameters* comprParams,
    +        const void* dictBuffer,
    +        size_t dictBufferSize,
    +        int displayLevel,
    +        const char* displayName,
    +        const BMK_advancedParams_t* adv)
     
     {
    -    int const dstParamsError = !dstBuffer ^ !dstCapacity;  /* must be both NULL or none */
    +    int const dstParamsError =
    +            !dstBuffer ^ !dstCapacity; /* must be both NULL or none */
     
    -    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize) /* avoid div by 0 */ ;
    -    U32 const maxNbBlocks = (U32) ((srcSize + (blockSize-1)) / blockSize) + nbFiles;
    +    size_t const blockSize =
    +            ((adv->blockSize >= 32 && (adv->mode != BMK_decodeOnly))
    +                     ? adv->blockSize
    +                     : srcSize)
    +            + (!srcSize) /* avoid div by 0 */;
    +    U32 const maxNbBlocks =
    +            (U32)((srcSize + (blockSize - 1)) / blockSize) + nbFiles;
     
         /* these are the blockTable parameters, just split up */
    -    const void ** const srcPtrs = (const void**)malloc(maxNbBlocks * sizeof(void*));
    +    const void** const srcPtrs =
    +            (const void**)malloc(maxNbBlocks * sizeof(void*));
         size_t* const srcSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
     
    -
    -    void ** const cPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
    -    size_t* const cSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
    +    void** const cPtrs        = (void**)malloc(maxNbBlocks * sizeof(void*));
    +    size_t* const cSizes      = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
         size_t* const cCapacities = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
     
    -    void ** const resPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
    +    void** const resPtrs   = (void**)malloc(maxNbBlocks * sizeof(void*));
         size_t* const resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
     
    -    BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
    -    BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
    +    BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(
    +            adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
    +    BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(
    +            adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
     
         ZSTD_CCtx* const cctx = ZSTD_createCCtx();
         ZSTD_DCtx* const dctx = ZSTD_createDCtx();
     
    -    const size_t maxCompressedSize = dstCapacity ? dstCapacity : ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);
    +    const size_t maxCompressedSize = dstCapacity
    +            ? dstCapacity
    +            : ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);
     
    -    void* const internalDstBuffer = dstBuffer ? NULL : malloc(maxCompressedSize);
    +    void* const internalDstBuffer =
    +            dstBuffer ? NULL : malloc(maxCompressedSize);
         void* const compressedBuffer = dstBuffer ? dstBuffer : internalDstBuffer;
     
    -    BMK_benchOutcome_t outcome = BMK_benchOutcome_error();  /* error by default */
    +    BMK_benchOutcome_t outcome =
    +            BMK_benchOutcome_error(); /* error by default */
     
         void* resultBuffer = srcSize ? malloc(srcSize) : NULL;
     
    -    int const allocationincomplete = !srcPtrs || !srcSizes || !cPtrs ||
    -        !cSizes || !cCapacities || !resPtrs || !resSizes ||
    -        !timeStateCompress || !timeStateDecompress ||
    -        !cctx || !dctx ||
    -        !compressedBuffer || !resultBuffer;
    -
    +    int const allocationincomplete = !srcPtrs || !srcSizes || !cPtrs || !cSizes
    +            || !cCapacities || !resPtrs || !resSizes || !timeStateCompress
    +            || !timeStateDecompress || !cctx || !dctx || !compressedBuffer
    +            || !resultBuffer;
     
         if (!allocationincomplete && !dstParamsError) {
    -        outcome = BMK_benchMemAdvancedNoAlloc(srcPtrs, srcSizes,
    -                                            cPtrs, cCapacities, cSizes,
    -                                            resPtrs, resSizes,
    -                                            &resultBuffer,
    -                                            compressedBuffer, maxCompressedSize,
    -                                            timeStateCompress, timeStateDecompress,
    -                                            srcBuffer, srcSize,
    -                                            fileSizes, nbFiles,
    -                                            cLevel, comprParams,
    -                                            dictBuffer, dictBufferSize,
    -                                            cctx, dctx,
    -                                            displayLevel, displayName, adv);
    +        outcome = BMK_benchMemAdvancedNoAlloc(
    +                srcPtrs,
    +                srcSizes,
    +                cPtrs,
    +                cCapacities,
    +                cSizes,
    +                resPtrs,
    +                resSizes,
    +                &resultBuffer,
    +                compressedBuffer,
    +                maxCompressedSize,
    +                timeStateCompress,
    +                timeStateDecompress,
    +                srcBuffer,
    +                srcSize,
    +                fileSizes,
    +                nbFiles,
    +                cLevel,
    +                comprParams,
    +                dictBuffer,
    +                dictBufferSize,
    +                cctx,
    +                dctx,
    +                displayLevel,
    +                displayName,
    +                adv);
         }
     
         /* clean up */
    @@ -643,66 +879,104 @@ BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
         free(resPtrs);
         free(resSizes);
     
    -    if(allocationincomplete) {
    -        RETURN_ERROR(31, BMK_benchOutcome_t, "allocation error : not enough memory");
    +    if (allocationincomplete) {
    +        RETURN_ERROR(
    +                31, BMK_benchOutcome_t, "allocation error : not enough memory");
         }
     
    -    if(dstParamsError) {
    +    if (dstParamsError) {
             RETURN_ERROR(32, BMK_benchOutcome_t, "Dst parameters not coherent");
         }
         return outcome;
     }
     
    -BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
    -                        const size_t* fileSizes, unsigned nbFiles,
    -                        int cLevel, const ZSTD_compressionParameters* comprParams,
    -                        const void* dictBuffer, size_t dictBufferSize,
    -                        int displayLevel, const char* displayName) {
    -
    +BMK_benchOutcome_t BMK_benchMem(
    +        const void* srcBuffer,
    +        size_t srcSize,
    +        const size_t* fileSizes,
    +        unsigned nbFiles,
    +        int cLevel,
    +        const ZSTD_compressionParameters* comprParams,
    +        const void* dictBuffer,
    +        size_t dictBufferSize,
    +        int displayLevel,
    +        const char* displayName)
    +{
         BMK_advancedParams_t const adv = BMK_initAdvancedParams();
    -    return BMK_benchMemAdvanced(srcBuffer, srcSize,
    -                                NULL, 0,
    -                                fileSizes, nbFiles,
    -                                cLevel, comprParams,
    -                                dictBuffer, dictBufferSize,
    -                                displayLevel, displayName, &adv);
    +    return BMK_benchMemAdvanced(
    +            srcBuffer,
    +            srcSize,
    +            NULL,
    +            0,
    +            fileSizes,
    +            nbFiles,
    +            cLevel,
    +            comprParams,
    +            dictBuffer,
    +            dictBufferSize,
    +            displayLevel,
    +            displayName,
    +            &adv);
     }
     
    -static BMK_benchOutcome_t BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,
    -                            const size_t* fileSizes, unsigned nbFiles,
    -                            int cLevel, const ZSTD_compressionParameters* comprParams,
    -                            const void* dictBuffer, size_t dictBufferSize,
    -                            int displayLevel, const char* displayName,
    -                            BMK_advancedParams_t const * const adv)
    +static BMK_benchOutcome_t BMK_benchCLevel(
    +        const void* srcBuffer,
    +        size_t benchedSize,
    +        const size_t* fileSizes,
    +        unsigned nbFiles,
    +        int cLevel,
    +        const ZSTD_compressionParameters* comprParams,
    +        const void* dictBuffer,
    +        size_t dictBufferSize,
    +        int displayLevel,
    +        const char* displayName,
    +        BMK_advancedParams_t const* const adv)
     {
         const char* pch = strrchr(displayName, '\\'); /* Windows */
    -    if (!pch) pch = strrchr(displayName, '/');    /* Linux */
    -    if (pch) displayName = pch+1;
    +    if (!pch)
    +        pch = strrchr(displayName, '/'); /* Linux */
    +    if (pch)
    +        displayName = pch + 1;
     
         if (adv->realTime) {
             DISPLAYLEVEL(2, "Note : switching to real-time priority \n");
             SET_REALTIME_PRIORITY;
         }
     
    -    if (displayLevel == 1 && !adv->additionalParam)   /* --quiet mode */
    +    if (displayLevel == 1 && !adv->additionalParam) /* --quiet mode */
             OUTPUT("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n",
    -                ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING,
    -                (unsigned)benchedSize, adv->nbSeconds, (unsigned)(adv->blockSize>>10));
    -
    -    return BMK_benchMemAdvanced(srcBuffer, benchedSize,
    -                                NULL, 0,
    -                                fileSizes, nbFiles,
    -                                cLevel, comprParams,
    -                                dictBuffer, dictBufferSize,
    -                                displayLevel, displayName, adv);
    +               ZSTD_VERSION_STRING,
    +               ZSTD_GIT_COMMIT_STRING,
    +               (unsigned)benchedSize,
    +               adv->nbSeconds,
    +               (unsigned)(adv->blockSize >> 10));
    +
    +    return BMK_benchMemAdvanced(
    +            srcBuffer,
    +            benchedSize,
    +            NULL,
    +            0,
    +            fileSizes,
    +            nbFiles,
    +            cLevel,
    +            comprParams,
    +            dictBuffer,
    +            dictBufferSize,
    +            displayLevel,
    +            displayName,
    +            adv);
     }
     
    -int BMK_syntheticTest(int cLevel, double compressibility,
    -                      const ZSTD_compressionParameters* compressionParams,
    -                      int displayLevel, const BMK_advancedParams_t* adv)
    +int BMK_syntheticTest(
    +        int cLevel,
    +        double compressibility,
    +        const ZSTD_compressionParameters* compressionParams,
    +        int displayLevel,
    +        const BMK_advancedParams_t* adv)
     {
    -    char name[20] = {0};
    -    size_t const benchedSize = 10000000;
    +    char nameBuff[20]        = { 0 };
    +    const char* name         = nameBuff;
    +    size_t const benchedSize = adv->blockSize ? adv->blockSize : 10000000;
         void* srcBuffer;
         BMK_benchOutcome_t res;
     
    @@ -719,15 +993,31 @@ int BMK_syntheticTest(int cLevel, double compressibility,
         }
     
         /* Fill input buffer */
    -    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
    +    if (compressibility < 0.0) {
    +        LOREM_genBuffer(srcBuffer, benchedSize, 0);
    +        name = "Lorem ipsum";
    +    } else {
    +        RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
    +        formatString_u(
    +                nameBuff,
    +                sizeof(nameBuff),
    +                "Synthetic %u%%",
    +                (unsigned)(compressibility * 100));
    +    }
     
         /* Bench */
    -    snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
    -    res = BMK_benchCLevel(srcBuffer, benchedSize,
    -                    &benchedSize /* ? */, 1 /* ? */,
    -                    cLevel, compressionParams,
    -                    NULL, 0,  /* dictionary */
    -                    displayLevel, name, adv);
    +    res = BMK_benchCLevel(
    +            srcBuffer,
    +            benchedSize,
    +            &benchedSize /* ? */,
    +            1 /* ? */,
    +            cLevel,
    +            compressionParams,
    +            NULL,
    +            0, /* dictionary */
    +            displayLevel,
    +            name,
    +            adv);
     
         /* clean up */
         free(srcBuffer);
    @@ -735,16 +1025,15 @@ int BMK_syntheticTest(int cLevel, double compressibility,
         return !BMK_isSuccessful_benchOutcome(res);
     }
     
    -
    -
     static size_t BMK_findMaxMem(U64 requiredMem)
     {
         size_t const step = 64 MB;
    -    BYTE* testmem = NULL;
    +    BYTE* testmem     = NULL;
     
         requiredMem = (((requiredMem >> 26) + 1) << 26);
         requiredMem += step;
    -    if (requiredMem > maxMemory) requiredMem = maxMemory;
    +    if (requiredMem > maxMemory)
    +        requiredMem = maxMemory;
     
         do {
             testmem = (BYTE*)malloc((size_t)requiredMem);
    @@ -758,53 +1047,75 @@ static size_t BMK_findMaxMem(U64 requiredMem)
     /*! BMK_loadFiles() :
      *  Loads `buffer` with content of files listed within `fileNamesTable`.
      *  At most, fills `buffer` entirely. */
    -static int BMK_loadFiles(void* buffer, size_t bufferSize,
    -                         size_t* fileSizes,
    -                         const char* const * fileNamesTable, unsigned nbFiles,
    -                         int displayLevel)
    +static int BMK_loadFiles(
    +        void* buffer,
    +        size_t bufferSize,
    +        size_t* fileSizes,
    +        const char* const* fileNamesTable,
    +        unsigned nbFiles,
    +        int displayLevel)
     {
         size_t pos = 0, totalSize = 0;
         unsigned n;
    -    for (n=0; n bufferSize-pos) fileSize = bufferSize-pos, nbFiles=n;   /* buffer too small - stop after this file */
    -            {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
    -                if (readSize != (size_t)fileSize) RETURN_ERROR_INT(11, "could not read %s", fileNamesTable[n]);
    +            if (fileSize > bufferSize - pos)
    +                fileSize = bufferSize - pos,
    +                nbFiles  = n; /* buffer too small - stop after this file */
    +            {
    +                size_t const readSize =
    +                        fread(((char*)buffer) + pos, 1, (size_t)fileSize, f);
    +                if (readSize != (size_t)fileSize)
    +                    RETURN_ERROR_INT(
    +                            11, "could not read %s", fileNamesTable[n]);
                     pos += readSize;
                 }
                 fileSizes[n] = (size_t)fileSize;
                 totalSize += (size_t)fileSize;
                 fclose(f);
    -    }   }
    +        }
    +    }
     
    -    if (totalSize == 0) RETURN_ERROR_INT(12, "no data to bench");
    +    if (totalSize == 0)
    +        RETURN_ERROR_INT(12, "no data to bench");
         return 0;
     }
     
     int BMK_benchFilesAdvanced(
    -                        const char* const * fileNamesTable, unsigned nbFiles,
    -                        const char* dictFileName, int cLevel,
    -                        const ZSTD_compressionParameters* compressionParams,
    -                        int displayLevel, const BMK_advancedParams_t* adv)
    +        const char* const* fileNamesTable,
    +        unsigned nbFiles,
    +        const char* dictFileName,
    +        int cLevel,
    +        const ZSTD_compressionParameters* compressionParams,
    +        int displayLevel,
    +        const BMK_advancedParams_t* adv)
     {
         void* srcBuffer = NULL;
         size_t benchedSize;
    -    void* dictBuffer = NULL;
    +    void* dictBuffer      = NULL;
         size_t dictBufferSize = 0;
    -    size_t* fileSizes = NULL;
    +    size_t* fileSizes     = NULL;
         BMK_benchOutcome_t res;
         U64 const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
     
    @@ -833,7 +1144,11 @@ int BMK_benchFilesAdvanced(
         if (dictFileName != NULL) {
             U64 const dictFileSize = UTIL_getFileSize(dictFileName);
             if (dictFileSize == UTIL_FILESIZE_UNKNOWN) {
    -            DISPLAYLEVEL(1, "error loading %s : %s \n", dictFileName, strerror(errno));
    +            DISPLAYLEVEL(
    +                    1,
    +                    "error loading %s : %s \n",
    +                    dictFileName,
    +                    strerror(errno));
                 free(fileSizes);
                 DISPLAYLEVEL(1, "benchmark aborted");
                 return 17;
    @@ -844,28 +1159,38 @@ int BMK_benchFilesAdvanced(
                 return 18;
             }
             dictBufferSize = (size_t)dictFileSize;
    -        dictBuffer = malloc(dictBufferSize);
    -        if (dictBuffer==NULL) {
    +        dictBuffer     = malloc(dictBufferSize);
    +        if (dictBuffer == NULL) {
                 free(fileSizes);
    -            DISPLAYLEVEL(1, "not enough memory for dictionary (%u bytes)",
    -                            (unsigned)dictBufferSize);
    +            DISPLAYLEVEL(
    +                    1,
    +                    "not enough memory for dictionary (%u bytes)",
    +                    (unsigned)dictBufferSize);
                 return 19;
             }
     
    -        {   int const errorCode = BMK_loadFiles(dictBuffer, dictBufferSize,
    -                                                fileSizes, &dictFileName /*?*/,
    -                                                1 /*?*/, displayLevel);
    +        {
    +            int const errorCode = BMK_loadFiles(
    +                    dictBuffer,
    +                    dictBufferSize,
    +                    fileSizes,
    +                    &dictFileName /*?*/,
    +                    1 /*?*/,
    +                    displayLevel);
                 if (errorCode) {
                     res = BMK_benchOutcome_error();
                     goto _cleanUp;
    -        }   }
    +            }
    +        }
         }
     
         /* Memory allocation & restrictions */
         benchedSize = BMK_findMaxMem(totalSizeToLoad * 3) / 3;
    -    if ((U64)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
    +    if ((U64)benchedSize > totalSizeToLoad)
    +        benchedSize = (size_t)totalSizeToLoad;
         if (benchedSize < totalSizeToLoad)
    -        DISPLAY("Not enough memory; testing %u MB only...\n", (unsigned)(benchedSize >> 20));
    +        DISPLAY("Not enough memory; testing %u MB only...\n",
    +                (unsigned)(benchedSize >> 20));
     
         srcBuffer = benchedSize ? malloc(benchedSize) : NULL;
         if (!srcBuffer) {
    @@ -876,25 +1201,41 @@ int BMK_benchFilesAdvanced(
         }
     
         /* Load input buffer */
    -    {   int const errorCode = BMK_loadFiles(srcBuffer, benchedSize,
    -                                        fileSizes, fileNamesTable, nbFiles,
    -                                        displayLevel);
    +    {
    +        int const errorCode = BMK_loadFiles(
    +                srcBuffer,
    +                benchedSize,
    +                fileSizes,
    +                fileNamesTable,
    +                nbFiles,
    +                displayLevel);
             if (errorCode) {
                 res = BMK_benchOutcome_error();
                 goto _cleanUp;
    -    }   }
    +        }
    +    }
     
         /* Bench */
    -    {   char mfName[20] = {0};
    -        snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
    -        {   const char* const displayName = (nbFiles > 1) ? mfName : fileNamesTable[0];
    -            res = BMK_benchCLevel(srcBuffer, benchedSize,
    -                                fileSizes, nbFiles,
    -                                cLevel, compressionParams,
    -                                dictBuffer, dictBufferSize,
    -                                displayLevel, displayName,
    -                                adv);
    -    }   }
    +    {
    +        char mfName[20] = { 0 };
    +        formatString_u(mfName, sizeof(mfName), " %u files", nbFiles);
    +        {
    +            const char* const displayName =
    +                    (nbFiles > 1) ? mfName : fileNamesTable[0];
    +            res = BMK_benchCLevel(
    +                    srcBuffer,
    +                    benchedSize,
    +                    fileSizes,
    +                    nbFiles,
    +                    cLevel,
    +                    compressionParams,
    +                    dictBuffer,
    +                    dictBufferSize,
    +                    displayLevel,
    +                    displayName,
    +                    adv);
    +        }
    +    }
     
     _cleanUp:
         free(srcBuffer);
    @@ -903,12 +1244,21 @@ int BMK_benchFilesAdvanced(
         return !BMK_isSuccessful_benchOutcome(res);
     }
     
    -
    -int BMK_benchFiles(const char* const * fileNamesTable, unsigned nbFiles,
    -                    const char* dictFileName,
    -                    int cLevel, const ZSTD_compressionParameters* compressionParams,
    -                    int displayLevel)
    +int BMK_benchFiles(
    +        const char* const* fileNamesTable,
    +        unsigned nbFiles,
    +        const char* dictFileName,
    +        int cLevel,
    +        const ZSTD_compressionParameters* compressionParams,
    +        int displayLevel)
     {
         BMK_advancedParams_t const adv = BMK_initAdvancedParams();
    -    return BMK_benchFilesAdvanced(fileNamesTable, nbFiles, dictFileName, cLevel, compressionParams, displayLevel, &adv);
    +    return BMK_benchFilesAdvanced(
    +            fileNamesTable,
    +            nbFiles,
    +            dictFileName,
    +            cLevel,
    +            compressionParams,
    +            displayLevel,
    +            &adv);
     }
    diff --git a/third-party/zstd/programs/benchzstd.h b/third-party/zstd/programs/benchzstd.h
    index f14a6819..ad3088cd 100644
    --- a/third-party/zstd/programs/benchzstd.h
    +++ b/third-party/zstd/programs/benchzstd.h
    @@ -100,6 +100,7 @@ typedef struct {
         BMK_mode_t mode;        /* 0: all, 1: compress only 2: decode only */
         unsigned nbSeconds;     /* default timing is in nbSeconds */
         size_t blockSize;       /* Maximum size of each block*/
    +    size_t targetCBlockSize;/* Approximative size of compressed blocks */
         int nbWorkers;          /* multithreading */
         unsigned realTime;      /* real time priority */
         int additionalParam;    /* used by python speed benchmark */
    @@ -126,11 +127,12 @@ int BMK_benchFilesAdvanced(
     
     /*! BMK_syntheticTest() -- called from zstdcli */
     /*  Generates a sample with datagen, using compressibility argument */
    -/*  cLevel - compression level to benchmark, errors if invalid
    - *  compressibility - determines compressibility of sample
    - *  compressionParams - basic compression Parameters
    - *  displayLevel - see benchFiles
    - *  adv - see advanced_Params_t
    +/* @cLevel - compression level to benchmark, errors if invalid
    + * @compressibility - determines compressibility of sample, range [0.0 - 1.0]
    + *        if @compressibility < 0.0, uses the lorem ipsum generator
    + * @compressionParams - basic compression Parameters
    + * @displayLevel - see benchFiles
    + * @adv - see advanced_Params_t
      * @return: 0 on success, !0 on error
      */
     int BMK_syntheticTest(int cLevel, double compressibility,
    diff --git a/third-party/zstd/programs/fileio.c b/third-party/zstd/programs/fileio.c
    index 84a0f48f..e3012a71 100644
    --- a/third-party/zstd/programs/fileio.c
    +++ b/third-party/zstd/programs/fileio.c
    @@ -527,7 +527,7 @@ static int FIO_removeFile(const char* path)
             DISPLAYLEVEL(2, "zstd: Refusing to remove non-regular file %s\n", path);
             return 0;
         }
    -#if defined(_WIN32) || defined(WIN32)
    +#if defined(_WIN32)
         /* windows doesn't allow remove read-only files,
          * so try to make it writable first */
         if (!(statbuf.st_mode & _S_IWRITE)) {
    @@ -1096,15 +1096,15 @@ static void FIO_adjustParamsForPatchFromMode(FIO_prefs_t* const prefs,
         comprParams->windowLog = MAX(ZSTD_WINDOWLOG_MIN, MIN(ZSTD_WINDOWLOG_MAX, fileWindowLog));
         if (fileWindowLog > ZSTD_cycleLog(cParams.chainLog, cParams.strategy)) {
             if (!prefs->ldmFlag)
    -            DISPLAYLEVEL(1, "long mode automatically triggered\n");
    +            DISPLAYLEVEL(2, "long mode automatically triggered\n");
             FIO_setLdmFlag(prefs, 1);
         }
         if (cParams.strategy >= ZSTD_btopt) {
    -        DISPLAYLEVEL(1, "[Optimal parser notes] Consider the following to improve patch size at the cost of speed:\n");
    -        DISPLAYLEVEL(1, "- Use --single-thread mode in the zstd cli\n");
    -        DISPLAYLEVEL(1, "- Set a larger targetLength (e.g. --zstd=targetLength=4096)\n");
    -        DISPLAYLEVEL(1, "- Set a larger chainLog (e.g. --zstd=chainLog=%u)\n", ZSTD_CHAINLOG_MAX);
    -        DISPLAYLEVEL(1, "Also consider playing around with searchLog and hashLog\n");
    +        DISPLAYLEVEL(3, "[Optimal parser notes] Consider the following to improve patch size at the cost of speed:\n");
    +        DISPLAYLEVEL(3, "- Use --single-thread mode in the zstd cli\n");
    +        DISPLAYLEVEL(3, "- Set a larger targetLength (e.g. --zstd=targetLength=4096)\n");
    +        DISPLAYLEVEL(3, "- Set a larger chainLog (e.g. --zstd=chainLog=%u)\n", ZSTD_CHAINLOG_MAX);
    +        DISPLAYLEVEL(3, "Also consider playing around with searchLog and hashLog\n");
         }
     }
     
    @@ -1839,7 +1839,6 @@ static int FIO_compressFilename_dstFile(FIO_ctx_t* const fCtx,
         int closeDstFile = 0;
         int result;
         int transferStat = 0;
    -    FILE *dstFile;
         int dstFd = -1;
     
         assert(AIO_ReadPool_getFile(ress.readCtx) != NULL);
    @@ -1854,10 +1853,11 @@ static int FIO_compressFilename_dstFile(FIO_ctx_t* const fCtx,
     
             closeDstFile = 1;
             DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: opening dst: %s \n", dstFileName);
    -        dstFile = FIO_openDstFile(fCtx, prefs, srcFileName, dstFileName, dstFileInitialPermissions);
    -        if (dstFile==NULL) return 1;  /* could not open dstFileName */
    -        dstFd = fileno(dstFile);
    -        AIO_WritePool_setFile(ress.writeCtx, dstFile);
    +        {   FILE *dstFile = FIO_openDstFile(fCtx, prefs, srcFileName, dstFileName, dstFileInitialPermissions);
    +            if (dstFile==NULL) return 1;  /* could not open dstFileName */
    +            dstFd = fileno(dstFile);
    +            AIO_WritePool_setFile(ress.writeCtx, dstFile);
    +        }
             /* Must only be added after FIO_openDstFile() succeeds.
              * Otherwise we may delete the destination file if it already exists,
              * and the user presses Ctrl-C when asked if they wish to overwrite.
    @@ -1907,6 +1907,110 @@ static const char *compressedFileExtensions[] = {
         TXZ_EXTENSION,
         LZ4_EXTENSION,
         TLZ4_EXTENSION,
    +    ".7z",
    +    ".aa3",
    +    ".aac",
    +    ".aar",
    +    ".ace",
    +    ".alac",
    +    ".ape",
    +    ".apk",
    +    ".apng",
    +    ".arc",
    +    ".archive",
    +    ".arj",
    +    ".ark",
    +    ".asf",
    +    ".avi",
    +    ".avif",
    +    ".ba",
    +    ".br",
    +    ".bz2",
    +    ".cab",
    +    ".cdx",
    +    ".chm",
    +    ".cr2",
    +    ".divx",
    +    ".dmg",
    +    ".dng",
    +    ".docm",
    +    ".docx",
    +    ".dotm",
    +    ".dotx",
    +    ".dsft",
    +    ".ear",
    +    ".eftx",
    +    ".emz",
    +    ".eot",
    +    ".epub",
    +    ".f4v",
    +    ".flac",
    +    ".flv",
    +    ".gho",
    +    ".gif",
    +    ".gifv",
    +    ".gnp",
    +    ".iso",
    +    ".jar",
    +    ".jpeg",
    +    ".jpg",
    +    ".jxl",
    +    ".lz",
    +    ".lzh",
    +    ".m4a",
    +    ".m4v",
    +    ".mkv",
    +    ".mov",
    +    ".mp2",
    +    ".mp3",
    +    ".mp4",
    +    ".mpa",
    +    ".mpc",
    +    ".mpe",
    +    ".mpeg",
    +    ".mpg",
    +    ".mpl",
    +    ".mpv",
    +    ".msi",
    +    ".odp",
    +    ".ods",
    +    ".odt",
    +    ".ogg",
    +    ".ogv",
    +    ".otp",
    +    ".ots",
    +    ".ott",
    +    ".pea",
    +    ".png",
    +    ".pptx",
    +    ".qt",
    +    ".rar",
    +    ".s7z",
    +    ".sfx",
    +    ".sit",
    +    ".sitx",
    +    ".sqx",
    +    ".svgz",
    +    ".swf",
    +    ".tbz2",
    +    ".tib",
    +    ".tlz",
    +    ".vob",
    +    ".war",
    +    ".webm",
    +    ".webp",
    +    ".wma",
    +    ".wmv",
    +    ".woff",
    +    ".woff2",
    +    ".wvl",
    +    ".xlsx",
    +    ".xpi",
    +    ".xps",
    +    ".zip",
    +    ".zipx",
    +    ".zoo",
    +    ".zpaq",
         NULL
     };
     
    @@ -2222,6 +2326,7 @@ static dRess_t FIO_createDResources(FIO_prefs_t* const prefs, const char* dictFi
         int forceNoUseMMap = prefs->mmapDict == ZSTD_ps_disable;
         stat_t statbuf;
         dRess_t ress;
    +    memset(&statbuf, 0, sizeof(statbuf));
         memset(&ress, 0, sizeof(ress));
     
         FIO_getDictFileStat(dictFileName, &statbuf);
    @@ -2336,9 +2441,10 @@ FIO_decompressZstdFrame(FIO_ctx_t* const fCtx, dRess_t* ress,
         U64 frameSize = 0;
         IOJob_t *writeJob = AIO_WritePool_acquireJob(ress->writeCtx);
     
    -    /* display last 20 characters only */
    +    /* display last 20 characters only when not --verbose */
         {   size_t const srcFileLength = strlen(srcFileName);
    -        if (srcFileLength>20) srcFileName += srcFileLength-20;
    +        if ((srcFileLength>20) && (g_display_prefs.displayLevel<3))
    +            srcFileName += srcFileLength-20;
         }
     
         ZSTD_DCtx_reset(ress->dctx, ZSTD_reset_session_only);
    diff --git a/third-party/zstd/programs/fileio_asyncio.c b/third-party/zstd/programs/fileio_asyncio.c
    index fe9cca95..ae6db69e 100644
    --- a/third-party/zstd/programs/fileio_asyncio.c
    +++ b/third-party/zstd/programs/fileio_asyncio.c
    @@ -453,8 +453,8 @@ static IOJob_t* AIO_ReadPool_findNextWaitingOffsetCompletedJob_locked(ReadPoolCt
     /* AIO_ReadPool_numReadsInFlight:
      * Returns the number of IO read jobs currently in flight. */
     static size_t AIO_ReadPool_numReadsInFlight(ReadPoolCtx_t* ctx) {
    -    const size_t jobsHeld = (ctx->currentJobHeld==NULL ? 0 : 1);
    -    return ctx->base.totalIoJobs - (ctx->base.availableJobsCount + ctx->completedJobsCount + jobsHeld);
    +    const int jobsHeld = (ctx->currentJobHeld==NULL ? 0 : 1);
    +    return (size_t)(ctx->base.totalIoJobs - (ctx->base.availableJobsCount + ctx->completedJobsCount + jobsHeld));
     }
     
     /* AIO_ReadPool_getNextCompletedJob:
    @@ -514,8 +514,7 @@ static void AIO_ReadPool_enqueueRead(ReadPoolCtx_t* ctx) {
     }
     
     static void AIO_ReadPool_startReading(ReadPoolCtx_t* ctx) {
    -    int i;
    -    for (i = 0; i < ctx->base.availableJobsCount; i++) {
    +    while(ctx->base.availableJobsCount) {
             AIO_ReadPool_enqueueRead(ctx);
         }
     }
    @@ -551,6 +550,7 @@ ReadPoolCtx_t* AIO_ReadPool_create(const FIO_prefs_t* prefs, size_t bufferSize)
         AIO_IOPool_init(&ctx->base, prefs, AIO_ReadPool_executeReadJob, bufferSize);
     
         ctx->coalesceBuffer = (U8*) malloc(bufferSize * 2);
    +    if(!ctx->coalesceBuffer) EXM_THROW(100, "Allocation error : not enough memory");
         ctx->srcBuffer = ctx->coalesceBuffer;
         ctx->srcBufferLoaded = 0;
         ctx->completedJobsCount = 0;
    diff --git a/third-party/zstd/programs/lorem.c b/third-party/zstd/programs/lorem.c
    new file mode 100644
    index 00000000..79030c92
    --- /dev/null
    +++ b/third-party/zstd/programs/lorem.c
    @@ -0,0 +1,285 @@
    +/*
    + * Copyright (c) Meta Platforms, Inc. and affiliates.
    + * All rights reserved.
    + *
    + * This source code is licensed under both the BSD-style license (found in the
    + * LICENSE file in the root directory of this source tree) and the GPLv2 (found
    + * in the COPYING file in the root directory of this source tree).
    + * You may select, at your option, one of the above-listed licenses.
    + */
    +
    +/* Implementation notes:
    + *
    + * This is a very simple lorem ipsum generator
    + * which features a static list of words
    + * and print them one after another randomly
    + * with a fake sentence / paragraph structure.
    + *
    + * The goal is to generate a printable text
    + * that can be used to fake a text compression scenario.
    + * The resulting compression / ratio curve of the lorem ipsum generator
    + * is more satisfying than the previous statistical generator,
    + * which was initially designed for entropy compression,
    + * and lacks a regularity more representative of text.
    + *
    + * The compression ratio achievable on the generated lorem ipsum
    + * is still a bit too good, presumably because the dictionary is a bit too
    + * small. It would be possible to create some more complex scheme, notably by
    + * enlarging the dictionary with a word generator, and adding grammatical rules
    + * (composition) and syntax rules. But that's probably overkill for the intended
    + * goal.
    + */
    +
    +#include "lorem.h"
    +#include 
    +#include  /* INT_MAX */
    +#include  /* memcpy */
    +
    +#define WORD_MAX_SIZE 20
    +
    +/* Define the word pool */
    +static const char* kWords[] = {
    +    "lorem",        "ipsum",      "dolor",       "sit",          "amet",
    +    "consectetur",  "adipiscing", "elit",        "sed",          "do",
    +    "eiusmod",      "tempor",     "incididunt",  "ut",           "labore",
    +    "et",           "dolore",     "magna",       "aliqua",       "dis",
    +    "lectus",       "vestibulum", "mattis",      "ullamcorper",  "velit",
    +    "commodo",      "a",          "lacus",       "arcu",         "magnis",
    +    "parturient",   "montes",     "nascetur",    "ridiculus",    "mus",
    +    "mauris",       "nulla",      "malesuada",   "pellentesque", "eget",
    +    "gravida",      "in",         "dictum",      "non",          "erat",
    +    "nam",          "voluptat",   "maecenas",    "blandit",      "aliquam",
    +    "etiam",        "enim",       "lobortis",    "scelerisque",  "fermentum",
    +    "dui",          "faucibus",   "ornare",      "at",           "elementum",
    +    "eu",           "facilisis",  "odio",        "morbi",        "quis",
    +    "eros",         "donec",      "ac",          "orci",         "purus",
    +    "turpis",       "cursus",     "leo",         "vel",          "porta",
    +    "consequat",    "interdum",   "varius",      "vulputate",    "aliquet",
    +    "pharetra",     "nunc",       "auctor",      "urna",         "id",
    +    "metus",        "viverra",    "nibh",        "cras",         "mi",
    +    "unde",         "omnis",      "iste",        "natus",        "error",
    +    "perspiciatis", "voluptatem", "accusantium", "doloremque",   "laudantium",
    +    "totam",        "rem",        "aperiam",     "eaque",        "ipsa",
    +    "quae",         "ab",         "illo",        "inventore",    "veritatis",
    +    "quasi",        "architecto", "beatae",      "vitae",        "dicta",
    +    "sunt",         "explicabo",  "nemo",        "ipsam",        "quia",
    +    "voluptas",     "aspernatur", "aut",         "odit",         "fugit",
    +    "consequuntur", "magni",      "dolores",     "eos",          "qui",
    +    "ratione",      "sequi",      "nesciunt",    "neque",        "porro",
    +    "quisquam",     "est",        "dolorem",     "adipisci",     "numquam",
    +    "eius",         "modi",       "tempora",     "incidunt",     "magnam",
    +    "quaerat",      "ad",         "minima",      "veniam",       "nostrum",
    +    "ullam",        "corporis",   "suscipit",    "laboriosam",   "nisi",
    +    "aliquid",      "ex",         "ea",          "commodi",      "consequatur",
    +    "autem",        "eum",        "iure",        "voluptate",    "esse",
    +    "quam",         "nihil",      "molestiae",   "illum",        "fugiat",
    +    "quo",          "pariatur",   "vero",        "accusamus",    "iusto",
    +    "dignissimos",  "ducimus",    "blanditiis",  "praesentium",  "voluptatum",
    +    "deleniti",     "atque",      "corrupti",    "quos",         "quas",
    +    "molestias",    "excepturi",  "sint",        "occaecati",    "cupiditate",
    +    "provident",    "similique",  "culpa",       "officia",      "deserunt",
    +    "mollitia",     "animi",      "laborum",     "dolorum",      "fuga",
    +    "harum",        "quidem",     "rerum",       "facilis",      "expedita",
    +    "distinctio",   "libero",     "tempore",     "cum",          "soluta",
    +    "nobis",        "eligendi",   "optio",       "cumque",       "impedit",
    +    "minus",        "quod",       "maxime",      "placeat",      "facere",
    +    "possimus",     "assumenda",  "repellendus", "temporibus",   "quibusdam",
    +    "officiis",     "debitis",    "saepe",       "eveniet",      "voluptates",
    +    "repudiandae",  "recusandae", "itaque",      "earum",        "hic",
    +    "tenetur",      "sapiente",   "delectus",    "reiciendis",   "cillum",
    +    "maiores",      "alias",      "perferendis", "doloribus",    "asperiores",
    +    "repellat",     "minim",      "nostrud",     "exercitation", "ullamco",
    +    "laboris",      "aliquip",    "duis",        "aute",         "irure",
    +};
    +static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]);
    +
    +/* simple 1-dimension distribution, based on word's length, favors small words
    + */
    +static const int kWeights[]    = { 0, 8, 6, 4, 3, 2 };
    +static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);
    +
    +#define DISTRIB_SIZE_MAX 650
    +static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
    +static unsigned g_distribCount         = 0;
    +
    +static void countFreqs(
    +        const char* words[],
    +        size_t nbWords,
    +        const int* weights,
    +        size_t nbWeights)
    +{
    +    unsigned total = 0;
    +    size_t w;
    +    for (w = 0; w < nbWords; w++) {
    +        size_t len = strlen(words[w]);
    +        int lmax;
    +        if (len >= nbWeights)
    +            len = nbWeights - 1;
    +        lmax = weights[len];
    +        total += (unsigned)lmax;
    +    }
    +    g_distribCount = total;
    +    assert(g_distribCount <= DISTRIB_SIZE_MAX);
    +}
    +
    +static void init_word_distrib(
    +        const char* words[],
    +        size_t nbWords,
    +        const int* weights,
    +        size_t nbWeights)
    +{
    +    size_t w, d = 0;
    +    countFreqs(words, nbWords, weights, nbWeights);
    +    for (w = 0; w < nbWords; w++) {
    +        size_t len = strlen(words[w]);
    +        int l, lmax;
    +        if (len >= nbWeights)
    +            len = nbWeights - 1;
    +        lmax = weights[len];
    +        for (l = 0; l < lmax; l++) {
    +            g_distrib[d++] = (int)w;
    +        }
    +    }
    +}
    +
    +/* Note: this unit only works when invoked sequentially.
    + * No concurrent access is allowed */
    +static char* g_ptr         = NULL;
    +static size_t g_nbChars    = 0;
    +static size_t g_maxChars   = 10000000;
    +static unsigned g_randRoot = 0;
    +
    +#define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r)))
    +static unsigned LOREM_rand(unsigned range)
    +{
    +    static const unsigned prime1 = 2654435761U;
    +    static const unsigned prime2 = 2246822519U;
    +    unsigned rand32              = g_randRoot;
    +    rand32 *= prime1;
    +    rand32 ^= prime2;
    +    rand32     = RDG_rotl32(rand32, 13);
    +    g_randRoot = rand32;
    +    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
    +}
    +
    +static void writeLastCharacters(void)
    +{
    +    size_t lastChars = g_maxChars - g_nbChars;
    +    assert(g_maxChars >= g_nbChars);
    +    if (lastChars == 0)
    +        return;
    +    g_ptr[g_nbChars++] = '.';
    +    if (lastChars > 2) {
    +        memset(g_ptr + g_nbChars, ' ', lastChars - 2);
    +    }
    +    if (lastChars > 1) {
    +        g_ptr[g_maxChars - 1] = '\n';
    +    }
    +    g_nbChars = g_maxChars;
    +}
    +
    +static void generateWord(const char* word, const char* separator, int upCase)
    +{
    +    size_t const len = strlen(word) + strlen(separator);
    +    if (g_nbChars + len > g_maxChars) {
    +        writeLastCharacters();
    +        return;
    +    }
    +    memcpy(g_ptr + g_nbChars, word, strlen(word));
    +    if (upCase) {
    +        static const char toUp = 'A' - 'a';
    +        g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
    +    }
    +    g_nbChars += strlen(word);
    +    memcpy(g_ptr + g_nbChars, separator, strlen(separator));
    +    g_nbChars += strlen(separator);
    +}
    +
    +static int about(unsigned target)
    +{
    +    return (int)(LOREM_rand(target) + LOREM_rand(target) + 1);
    +}
    +
    +/* Function to generate a random sentence */
    +static void generateSentence(int nbWords)
    +{
    +    int commaPos       = about(9);
    +    int comma2         = commaPos + about(7);
    +    int qmark          = (LOREM_rand(11) == 7);
    +    const char* endSep = qmark ? "? " : ". ";
    +    int i;
    +    for (i = 0; i < nbWords; i++) {
    +        int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
    +        const char* const word = kWords[wordID];
    +        const char* sep        = " ";
    +        if (i == commaPos)
    +            sep = ", ";
    +        if (i == comma2)
    +            sep = ", ";
    +        if (i == nbWords - 1)
    +            sep = endSep;
    +        generateWord(word, sep, i == 0);
    +    }
    +}
    +
    +static void generateParagraph(int nbSentences)
    +{
    +    int i;
    +    for (i = 0; i < nbSentences; i++) {
    +        int wordsPerSentence = about(11);
    +        generateSentence(wordsPerSentence);
    +    }
    +    if (g_nbChars < g_maxChars) {
    +        g_ptr[g_nbChars++] = '\n';
    +    }
    +    if (g_nbChars < g_maxChars) {
    +        g_ptr[g_nbChars++] = '\n';
    +    }
    +}
    +
    +/* It's "common" for lorem ipsum generators to start with the same first
    + * pre-defined sentence */
    +static void generateFirstSentence(void)
    +{
    +    int i;
    +    for (i = 0; i < 18; i++) {
    +        const char* word      = kWords[i];
    +        const char* separator = " ";
    +        if (i == 4)
    +            separator = ", ";
    +        if (i == 7)
    +            separator = ", ";
    +        generateWord(word, separator, i == 0);
    +    }
    +    generateWord(kWords[18], ". ", 0);
    +}
    +
    +size_t
    +LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
    +{
    +    g_ptr = (char*)buffer;
    +    assert(size < INT_MAX);
    +    g_maxChars = size;
    +    g_nbChars  = 0;
    +    g_randRoot = seed;
    +    if (g_distribCount == 0) {
    +        init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
    +    }
    +
    +    if (first) {
    +        generateFirstSentence();
    +    }
    +    while (g_nbChars < g_maxChars) {
    +        int sentencePerParagraph = about(7);
    +        generateParagraph(sentencePerParagraph);
    +        if (!fill)
    +            break; /* only generate one paragraph in not-fill mode */
    +    }
    +    g_ptr = NULL;
    +    return g_nbChars;
    +}
    +
    +void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
    +{
    +    LOREM_genBlock(buffer, size, seed, 1, 1);
    +}
    diff --git a/third-party/zstd/programs/lorem.h b/third-party/zstd/programs/lorem.h
    new file mode 100644
    index 00000000..4a87f874
    --- /dev/null
    +++ b/third-party/zstd/programs/lorem.h
    @@ -0,0 +1,32 @@
    +/*
    + * Copyright (c) Meta Platforms, Inc. and affiliates.
    + * All rights reserved.
    + *
    + * This source code is licensed under both the BSD-style license (found in the
    + * LICENSE file in the root directory of this source tree) and the GPLv2 (found
    + * in the COPYING file in the root directory of this source tree).
    + * You may select, at your option, one of the above-listed licenses.
    + */
    +
    +/* lorem ipsum generator */
    +
     +#include <stddef.h>   /* size_t */
    +
    +/*
    + * LOREM_genBuffer():
    + * Generate @size bytes of compressible data using lorem ipsum generator
    + * into provided @buffer.
    + */
    +void LOREM_genBuffer(void* buffer, size_t size, unsigned seed);
    +
    +/*
    + * LOREM_genBlock():
    + * Similar to LOREM_genBuffer, with additional controls :
    + * - @first : generate the first sentence
    + * - @fill : fill the entire @buffer,
    + *           if ==0: generate one paragraph at most.
    + * @return : nb of bytes generated into @buffer.
    + */
    +size_t LOREM_genBlock(void* buffer, size_t size,
    +                      unsigned seed,
    +                      int first, int fill);
    diff --git a/third-party/zstd/programs/platform.h b/third-party/zstd/programs/platform.h
    index 18a3587b..4d2b9490 100644
    --- a/third-party/zstd/programs/platform.h
    +++ b/third-party/zstd/programs/platform.h
    @@ -74,8 +74,7 @@ extern "C" {
     ***************************************************************/
     #ifndef PLATFORM_POSIX_VERSION
     
    -#  if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \
    -     || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)  /* BSD distros */
    +#  if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */
          /* exception rule : force posix version to 200112L,
           * note: it's better to use unistd.h's _POSIX_VERSION whenever possible */
     #    define PLATFORM_POSIX_VERSION 200112L
    @@ -89,7 +88,7 @@ extern "C" {
      */
     #  elif !defined(_WIN32) \
          && ( defined(__unix__) || defined(__unix) \
    -       || defined(__midipix__) || defined(__VMS) || defined(__HAIKU__) )
    +       || defined(_QNX_SOURCE) || defined(__midipix__) || defined(__VMS) || defined(__HAIKU__) )
     
     #    if defined(__linux__) || defined(__linux) || defined(__CYGWIN__)
     #      ifndef _POSIX_C_SOURCE
    @@ -141,7 +140,7 @@ extern "C" {
     #elif defined(MSDOS) || defined(OS2)
      #  include <io.h>        /* _isatty */
     #  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
    -#elif defined(WIN32) || defined(_WIN32)
    +#elif defined(_WIN32)
      #  include <io.h>       /* _isatty */
      #  include <windows.h>  /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */
      #  include <stdio.h>    /* FILE */
    @@ -157,7 +156,7 @@ static __inline int IS_CONSOLE(FILE* stdStream) {
     /******************************
     *  OS-specific IO behaviors
     ******************************/
    -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32)
    +#if defined(MSDOS) || defined(OS2) || defined(_WIN32)
      #  include <fcntl.h>    /* _O_BINARY */
      #  include <io.h>       /* _setmode, _fileno, _get_osfhandle */
     #  if !defined(__DJGPP__)
    diff --git a/third-party/zstd/programs/util.c b/third-party/zstd/programs/util.c
    index c9031e91..7f65f937 100644
    --- a/third-party/zstd/programs/util.c
    +++ b/third-party/zstd/programs/util.c
    @@ -23,16 +23,27 @@ extern "C" {
      #include <errno.h>
      #include <assert.h>
     
    +#if defined(__FreeBSD__)
     +#include <sys/param.h> /* __FreeBSD_version */
    +#endif /* #ifdef __FreeBSD__ */
    +
     #if defined(_WIN32)
      #  include <sys/utime.h>  /* utime */
      #  include <io.h>          /* _chmod */
    +#  define ZSTD_USE_UTIMENSAT 0
     #else
      #  include <unistd.h>     /* chown, stat */
    -#  if PLATFORM_POSIX_VERSION < 200809L || !defined(st_mtime)
    -#    include     /* utime */
     +#  include <sys/stat.h>   /* utimensat, st_mtime */
    +#  if (PLATFORM_POSIX_VERSION >= 200809L && defined(st_mtime)) \
    +      || (defined(__FreeBSD__) && __FreeBSD_version >= 1100056)
    +#    define ZSTD_USE_UTIMENSAT 1
     #  else
    +#    define ZSTD_USE_UTIMENSAT 0
    +#  endif
    +#  if ZSTD_USE_UTIMENSAT
      #    include <fcntl.h>     /* AT_FDCWD */
     -#    include <sys/stat.h>  /* utimensat */
     +#  else
     +#    include <utime.h>     /* utime */
     #  endif
     #endif
     
    @@ -259,7 +270,12 @@ int UTIL_utime(const char* filename, const stat_t *statbuf)
          * that struct stat has a struct timespec st_mtim member. We need this
          * check because there are some platforms that claim to be POSIX 2008
          * compliant but which do not have st_mtim... */
    -#if (PLATFORM_POSIX_VERSION >= 200809L) && defined(st_mtime)
    +    /* FreeBSD has implemented POSIX 2008 for a long time but still only
    +     * advertises support for POSIX 2001. They have a version macro that
    +     * lets us safely gate them in.
    +     * See https://docs.freebsd.org/en/books/porters-handbook/versions/.
    +     */
    +#if ZSTD_USE_UTIMENSAT
         {
             /* (atime, mtime) */
             struct timespec timebuf[2] = { {0, UTIME_NOW} };
    @@ -660,7 +676,6 @@ UTIL_createFileNamesTable_fromFileName(const char* inputFileName)
         size_t nbFiles = 0;
         char* buf;
         size_t bufSize;
    -    size_t pos = 0;
         stat_t statbuf;
     
         if (!UTIL_stat(inputFileName, &statbuf) || !UTIL_isRegularFileStat(&statbuf))
    @@ -687,12 +702,13 @@ UTIL_createFileNamesTable_fromFileName(const char* inputFileName)
         {   const char** filenamesTable = (const char**) malloc(nbFiles * sizeof(*filenamesTable));
             CONTROL(filenamesTable != NULL);
     
    -        {   size_t fnb;
    -            for (fnb = 0, pos = 0; fnb < nbFiles; fnb++) {
    +        {   size_t fnb, pos = 0;
    +            for (fnb = 0; fnb < nbFiles; fnb++) {
                     filenamesTable[fnb] = buf+pos;
                     pos += strlen(buf+pos)+1;  /* +1 for the finishing `\0` */
    -        }   }
    +            }
             assert(pos <= bufSize);
    +        }
     
             return UTIL_assembleFileNamesTable(filenamesTable, nbFiles, buf);
         }
    @@ -753,7 +769,7 @@ void UTIL_refFilename(FileNamesTable* fnt, const char* filename)
     
     static size_t getTotalTableSize(FileNamesTable* table)
     {
    -    size_t fnb = 0, totalSize = 0;
    +    size_t fnb, totalSize = 0;
         for(fnb = 0 ; fnb < table->tableSize && table->fileNames[fnb] ; ++fnb) {
             totalSize += strlen(table->fileNames[fnb]) + 1; /* +1 to add '\0' at the end of each fileName */
         }
    @@ -1119,9 +1135,6 @@ static char* mallocAndJoin2Dir(const char *dir1, const char *dir2)
             memcpy(outDirBuffer, dir1, dir1Size);
             outDirBuffer[dir1Size] = '\0';
     
    -        if (dir2[0] == '.')
    -            return outDirBuffer;
    -
             buffer = outDirBuffer + dir1Size;
             if (dir1Size > 0 && *(buffer - 1) != PATH_SEP) {
                 *buffer = PATH_SEP;
    @@ -1546,7 +1559,6 @@ int UTIL_countCores(int logical)
     
     #elif defined(__FreeBSD__)
     
     -#include <sys/param.h>
      #include <sys/sysctl.h>
     
     /* Use physical core sysctl when available
    diff --git a/third-party/zstd/programs/util.h b/third-party/zstd/programs/util.h
    index 8234646b..571d3942 100644
    --- a/third-party/zstd/programs/util.h
    +++ b/third-party/zstd/programs/util.h
    @@ -338,7 +338,7 @@ void UTIL_refFilename(FileNamesTable* fnt, const char* filename);
     FileNamesTable*
     UTIL_createExpandedFNT(const char* const* filenames, size_t nbFilenames, int followLinks);
     
    -#if defined(_WIN32) || defined(WIN32)
    +#if defined(_WIN32)
     DWORD CountSetBits(ULONG_PTR bitMask);
     #endif
     
    diff --git a/third-party/zstd/programs/zstd.1 b/third-party/zstd/programs/zstd.1
    index 383d9947..2b5a9851 100644
    --- a/third-party/zstd/programs/zstd.1
    +++ b/third-party/zstd/programs/zstd.1
    @@ -1,381 +1,566 @@
    -.TH "ZSTD" "1" "March 2023" "zstd 1.5.5" "User Commands"
    +.
    +.TH "ZSTD" "1" "March 2024" "zstd 1.5.6" "User Commands"
    +.
     .SH "NAME"
     \fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files
    +.
     .SH "SYNOPSIS"
    -.TS
    -allbox;
    -\fBzstd\fR [\fIOPTIONS\fR] [\-	\fIINPUT\-FILE\fR] [\-o \fIOUTPUT\-FILE\fR]
    -.TE
    +\fBzstd\fR [\fIOPTIONS\fR] [\-|\fIINPUT\-FILE\fR] [\-o \fIOUTPUT\-FILE\fR]
    +.
     .P
     \fBzstdmt\fR is equivalent to \fBzstd \-T0\fR
    +.
     .P
     \fBunzstd\fR is equivalent to \fBzstd \-d\fR
    +.
     .P
     \fBzstdcat\fR is equivalent to \fBzstd \-dcf\fR
    +.
     .SH "DESCRIPTION"
    -\fBzstd\fR is a fast lossless compression algorithm and data compression tool, with command line syntax similar to \fBgzip\fR(1) and \fBxz\fR(1)\. It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages\. \fBzstd\fR offers highly configurable compression speed, from fast modes at > 200 MB/s per core, to strong modes with excellent compression ratios\. It also features a very fast decoder, with speeds > 500 MB/s per core\.
    +\fBzstd\fR is a fast lossless compression algorithm and data compression tool, with command line syntax similar to \fBgzip\fR(1) and \fBxz\fR(1)\. It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages\. \fBzstd\fR offers highly configurable compression speed, from fast modes at > 200 MB/s per core, to strong modes with excellent compression ratios\. It also features a very fast decoder, with speeds > 500 MB/s per core, which remains roughly stable at all compression settings\.
    +.
     .P
    -\fBzstd\fR command line syntax is generally similar to gzip, but features the following differences:
    -.IP "\[ci]" 4
    +\fBzstd\fR command line syntax is generally similar to gzip, but features the following few differences:
    +.
    +.IP "\(bu" 4
     Source files are preserved by default\. It\'s possible to remove them automatically by using the \fB\-\-rm\fR command\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     When compressing a single file, \fBzstd\fR displays progress notifications and result summary by default\. Use \fB\-q\fR to turn them off\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fBzstd\fR displays a short help page when command line is an error\. Use \fB\-q\fR to turn it off\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fBzstd\fR does not accept input from console, though it does accept \fBstdin\fR when it\'s not the console\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fBzstd\fR does not store the input\'s filename or attributes, only its contents\.
    +.
     .IP "" 0
    +.
     .P
     \fBzstd\fR processes each \fIfile\fR according to the selected operation mode\. If no \fIfiles\fR are given or \fIfile\fR is \fB\-\fR, \fBzstd\fR reads from standard input and writes the processed data to standard output\. \fBzstd\fR will refuse to write compressed data to standard output if it is a terminal: it will display an error message and skip the file\. Similarly, \fBzstd\fR will refuse to read compressed data from standard input if it is a terminal\.
    +.
     .P
     Unless \fB\-\-stdout\fR or \fB\-o\fR is specified, \fIfiles\fR are written to a new file whose name is derived from the source \fIfile\fR name:
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     When compressing, the suffix \fB\.zst\fR is appended to the source filename to get the target filename\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     When decompressing, the \fB\.zst\fR suffix is removed from the source filename to get the target filename
    +.
     .IP "" 0
    +.
     .SS "Concatenation with \.zst Files"
     It is possible to concatenate multiple \fB\.zst\fR files\. \fBzstd\fR will decompress such agglomerated file as if it was a single \fB\.zst\fR file\.
    +.
     .SH "OPTIONS"
    +.
     .SS "Integer Suffixes and Special Values"
     In most places where an integer argument is expected, an optional suffix is supported to easily indicate large integers\. There must be no space between the integer and the suffix\.
    +.
     .TP
     \fBKiB\fR
    -Multiply the integer by 1,024 (2\e^10)\. \fBKi\fR, \fBK\fR, and \fBKB\fR are accepted as synonyms for \fBKiB\fR\.
    +Multiply the integer by 1,024 (2^10)\. \fBKi\fR, \fBK\fR, and \fBKB\fR are accepted as synonyms for \fBKiB\fR\.
    +.
     .TP
     \fBMiB\fR
    -Multiply the integer by 1,048,576 (2\e^20)\. \fBMi\fR, \fBM\fR, and \fBMB\fR are accepted as synonyms for \fBMiB\fR\.
    +Multiply the integer by 1,048,576 (2^20)\. \fBMi\fR, \fBM\fR, and \fBMB\fR are accepted as synonyms for \fBMiB\fR\.
    +.
     .SS "Operation Mode"
     If multiple operation mode options are given, the last one takes effect\.
    +.
     .TP
     \fB\-z\fR, \fB\-\-compress\fR
     Compress\. This is the default operation mode when no operation mode option is specified and no other operation mode is implied from the command name (for example, \fBunzstd\fR implies \fB\-\-decompress\fR)\.
    +.
     .TP
     \fB\-d\fR, \fB\-\-decompress\fR, \fB\-\-uncompress\fR
     Decompress\.
    +.
     .TP
     \fB\-t\fR, \fB\-\-test\fR
     Test the integrity of compressed \fIfiles\fR\. This option is equivalent to \fB\-\-decompress \-\-stdout > /dev/null\fR, decompressed data is discarded and checksummed for errors\. No files are created or removed\.
    +.
     .TP
     \fB\-b#\fR
     Benchmark file(s) using compression level \fI#\fR\. See \fIBENCHMARK\fR below for a description of this operation\.
    +.
     .TP
     \fB\-\-train FILES\fR
     Use \fIFILES\fR as a training set to create a dictionary\. The training set should contain a lot of small files (> 100)\. See \fIDICTIONARY BUILDER\fR below for a description of this operation\.
    +.
     .TP
     \fB\-l\fR, \fB\-\-list\fR
     Display information related to a zstd compressed file, such as size, ratio, and checksum\. Some of these fields may not be available\. This command\'s output can be augmented with the \fB\-v\fR modifier\.
    +.
     .SS "Operation Modifiers"
    -.IP "\[ci]" 4
    -\fB\-#\fR: selects \fB#\fR compression level [1\-19] (default: 3)
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
    +\fB\-#\fR: selects \fB#\fR compression level [1\-19] (default: 3)\. Higher compression levels \fIgenerally\fR produce higher compression ratio at the expense of speed and memory\. A rough rule of thumb is that compression speed is expected to be divided by 2 every 2 levels\. Technically, each level is mapped to a set of advanced parameters (that can also be modified individually, see below)\. Because the compressor\'s behavior highly depends on the content to compress, there\'s no guarantee of a smooth progression from one level to another\.
    +.
    +.IP "\(bu" 4
     \fB\-\-ultra\fR: unlocks high compression levels 20+ (maximum 22), using a lot more memory\. Note that decompression will also require more memory when using these levels\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-fast[=#]\fR: switch to ultra\-fast compression levels\. If \fB=#\fR is not present, it defaults to \fB1\fR\. The higher the value, the faster the compression speed, at the cost of some compression ratio\. This setting overwrites compression level if one was set previously\. Similarly, if a compression level is set after \fB\-\-fast\fR, it overrides it\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-T#\fR, \fB\-\-threads=#\fR: Compress using \fB#\fR working threads (default: 1)\. If \fB#\fR is 0, attempt to detect and use the number of physical CPU cores\. In all cases, the nb of threads is capped to \fBZSTDMT_NBWORKERS_MAX\fR, which is either 64 in 32\-bit mode, or 256 for 64\-bit environments\. This modifier does nothing if \fBzstd\fR is compiled without multithread support\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-single\-thread\fR: Use a single thread for both I/O and compression\. As compression is serialized with I/O, this can be slightly slower\. Single\-thread mode features significantly lower memory usage, which can be useful for systems with limited amount of memory, such as 32\-bit systems\.
    +.
     .IP
     Note 1: this mode is the only available one when multithread support is disabled\.
    +.
     .IP
     Note 2: this mode is different from \fB\-T1\fR, which spawns 1 compression thread in parallel with I/O\. Final compressed result is also slightly different from \fB\-T1\fR\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-auto\-threads={physical,logical} (default: physical)\fR: When using a default amount of threads via \fB\-T0\fR, choose the default based on the number of detected physical or logical cores\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-adapt[=min=#,max=#]\fR: \fBzstd\fR will dynamically adapt compression level to perceived I/O conditions\. Compression level adaptation can be observed live by using command \fB\-v\fR\. Adaptation can be constrained between supplied \fBmin\fR and \fBmax\fR levels\. The feature works when combined with multi\-threading and \fB\-\-long\fR mode\. It does not work with \fB\-\-single\-thread\fR\. It sets window size to 8 MiB by default (can be changed manually, see \fBwlog\fR)\. Due to the chaotic nature of dynamic adaptation, compressed result is not reproducible\.
    +.
     .IP
     \fINote\fR: at the time of this writing, \fB\-\-adapt\fR can remain stuck at low speed when combined with multiple worker threads (>=2)\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-long[=#]\fR: enables long distance matching with \fB#\fR \fBwindowLog\fR, if \fB#\fR is not present it defaults to \fB27\fR\. This increases the window size (\fBwindowLog\fR) and memory usage for both the compressor and decompressor\. This setting is designed to improve the compression ratio for files with long matches at a large distance\.
    +.
     .IP
     Note: If \fBwindowLog\fR is set to larger than 27, \fB\-\-long=windowLog\fR or \fB\-\-memory=windowSize\fR needs to be passed to the decompressor\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-D DICT\fR: use \fBDICT\fR as Dictionary to compress or decompress FILE(s)
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-patch\-from FILE\fR: Specify the file to be used as a reference point for zstd\'s diff engine\. This is effectively dictionary compression with some convenient parameter selection, namely that \fIwindowSize\fR > \fIsrcSize\fR\.
    +.
     .IP
     Note: cannot use both this and \fB\-D\fR together\.
    +.
     .IP
     Note: \fB\-\-long\fR mode will be automatically activated if \fIchainLog\fR < \fIfileLog\fR (\fIfileLog\fR being the \fIwindowLog\fR required to cover the whole file)\. You can also manually force it\.
    +.
     .IP
     Note: for all levels, you can use \fB\-\-patch\-from\fR in \fB\-\-single\-thread\fR mode to improve compression ratio at the cost of speed\.
    +.
     .IP
     Note: for level 19, you can get increased compression ratio at the cost of speed by specifying \fB\-\-zstd=targetLength=\fR to be something large (i\.e\. 4096), and by setting a large \fB\-\-zstd=chainLog=\fR\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-rsyncable\fR: \fBzstd\fR will periodically synchronize the compression state to make the compressed file more rsync\-friendly\. There is a negligible impact to compression ratio, and a potential impact to compression speed, perceptible at higher speeds, for example when combining \fB\-\-rsyncable\fR with many parallel worker threads\. This feature does not work with \fB\-\-single\-thread\fR\. You probably don\'t want to use it with long range mode, since it will decrease the effectiveness of the synchronization points, but your mileage may vary\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-C\fR, \fB\-\-[no\-]check\fR: add integrity check computed from uncompressed data (default: enabled)
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-[no\-]content\-size\fR: enable / disable whether or not the original size of the file is placed in the header of the compressed file\. The default option is \fB\-\-content\-size\fR (meaning that the original size will be placed in the header)\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-no\-dictID\fR: do not store dictionary ID within frame header (dictionary compression)\. The decoder will have to rely on implicit knowledge about which dictionary to use, it won\'t be able to check if it\'s correct\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-M#\fR, \fB\-\-memory=#\fR: Set a memory usage limit\. By default, \fBzstd\fR uses 128 MiB for decompression as the maximum amount of memory the decompressor is allowed to use, but you can override this manually if need be in either direction (i\.e\. you can increase or decrease it)\.
    +.
     .IP
     This is also used during compression when using with \fB\-\-patch\-from=\fR\. In this case, this parameter overrides that maximum size allowed for a dictionary\. (128 MiB)\.
    +.
     .IP
     Additionally, this can be used to limit memory for dictionary training\. This parameter overrides the default limit of 2 GiB\. zstd will load training samples up to the memory limit and ignore the rest\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-stream\-size=#\fR: Sets the pledged source size of input coming from a stream\. This value must be exact, as it will be included in the produced frame header\. Incorrect stream sizes will cause an error\. This information will be used to better optimize compression parameters, resulting in better and potentially faster compression, especially for smaller source sizes\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-size\-hint=#\fR: When handling input from a stream, \fBzstd\fR must guess how large the source size will be when optimizing compression parameters\. If the stream size is relatively small, this guess may be a poor one, resulting in a higher compression ratio than expected\. This feature allows for controlling the guess when needed\. Exact guesses result in better compression ratios\. Overestimates result in slightly degraded compression ratios, while underestimates may result in significant degradation\.
    -.IP "\[ci]" 4
    -\fB\-o FILE\fR: save result into \fBFILE\fR\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
    +\fB\-\-target\-compressed\-block\-size=#\fR: Attempt to produce compressed blocks of approximately this size\. This will split larger blocks in order to approach this target\. This feature is notably useful for improved latency, when the receiver can leverage receiving early incomplete data\. This parameter defines a loose target: compressed blocks will target this size "on average", but individual blocks can still be larger or smaller\. Enabling this feature can decrease compression speed by up to ~10% at level 1\. Higher levels will see smaller relative speed regression, becoming invisible at higher settings\.
    +.
    +.IP "\(bu" 4
     \fB\-f\fR, \fB\-\-force\fR: disable input and output checks\. Allows overwriting existing files, input from console, output to stdout, operating on links, block devices, etc\. During decompression and when the output destination is stdout, pass\-through unrecognized formats as\-is\.
    -.IP "\[ci]" 4
    -\fB\-c\fR, \fB\-\-stdout\fR: write to standard output (even if it is the console); keep original files unchanged\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
    +\fB\-c\fR, \fB\-\-stdout\fR: write to standard output (even if it is the console); keep original files (disable \fB\-\-rm\fR)\.
    +.
    +.IP "\(bu" 4
    +\fB\-o FILE\fR: save result into \fBFILE\fR\. Note that this operation is in conflict with \fB\-c\fR\. If both operations are present on the command line, the last expressed one wins\.
    +.
    +.IP "\(bu" 4
     \fB\-\-[no\-]sparse\fR: enable / disable sparse FS support, to make files with many zeroes smaller on disk\. Creating sparse files may save disk space and speed up decompression by reducing the amount of disk I/O\. default: enabled when output is into a file, and disabled when output is stdout\. This setting overrides default and can force sparse mode over stdout\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-[no\-]pass\-through\fR enable / disable passing through uncompressed files as\-is\. During decompression when pass\-through is enabled, unrecognized formats will be copied as\-is from the input to the output\. By default, pass\-through will occur when the output destination is stdout and the force (\fB\-f\fR) option is set\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-rm\fR: remove source file(s) after successful compression or decompression\. This command is silently ignored if output is \fBstdout\fR\. If used in combination with \fB\-o\fR, triggers a confirmation prompt (which can be silenced with \fB\-f\fR), as this is a destructive operation\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-k\fR, \fB\-\-keep\fR: keep source file(s) after successful compression or decompression\. This is the default behavior\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-r\fR: operate recursively on directories\. It selects all files in the named directory and all its subdirectories\. This can be useful both to reduce command line typing, and to circumvent shell expansion limitations, when there are a lot of files and naming breaks the maximum size of a command line\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-filelist FILE\fR read a list of files to process as content from \fBFILE\fR\. Format is compatible with \fBls\fR output, with one file per line\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-output\-dir\-flat DIR\fR: resulting files are stored into target \fBDIR\fR directory, instead of same directory as origin file\. Be aware that this command can introduce name collision issues, if multiple files, from different directories, end up having the same name\. Collision resolution ensures first file with a given name will be present in \fBDIR\fR, while in combination with \fB\-f\fR, the last file will be present instead\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-output\-dir\-mirror DIR\fR: similar to \fB\-\-output\-dir\-flat\fR, the output files are stored underneath target \fBDIR\fR directory, but this option will replicate input directory hierarchy into output \fBDIR\fR\.
    +.
     .IP
     If input directory contains "\.\.", the files in this directory will be ignored\. If input directory is an absolute directory (i\.e\. "/var/tmp/abc"), it will be stored into the "output\-dir/var/tmp/abc"\. If there are multiple input files or directories, name collision resolution will follow the same rules as \fB\-\-output\-dir\-flat\fR\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-format=FORMAT\fR: compress and decompress in other formats\. If compiled with support, zstd can compress to or decompress from other compression algorithm formats\. Possibly available options are \fBzstd\fR, \fBgzip\fR, \fBxz\fR, \fBlzma\fR, and \fBlz4\fR\. If no such format is provided, \fBzstd\fR is the default\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-h\fR/\fB\-H\fR, \fB\-\-help\fR: display help/long help and exit
    -.IP "\[ci]" 4
    -\fB\-V\fR, \fB\-\-version\fR: display version number and exit\. Advanced: \fB\-vV\fR also displays supported formats\. \fB\-vvV\fR also displays POSIX support\. \fB\-q\fR will only display the version number, suitable for machine reading\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
    +\fB\-V\fR, \fB\-\-version\fR: display version number and immediately exit\. note that, since it exits, flags specified after \fB\-V\fR are effectively ignored\. Advanced: \fB\-vV\fR also displays supported formats\. \fB\-vvV\fR also displays POSIX support\. \fB\-qV\fR will only display the version number, suitable for machine reading\.
    +.
    +.IP "\(bu" 4
     \fB\-v\fR, \fB\-\-verbose\fR: verbose mode, display more information
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-q\fR, \fB\-\-quiet\fR: suppress warnings, interactivity, and notifications\. specify twice to suppress errors too\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-no\-progress\fR: do not display the progress bar, but keep all other messages\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
     \fB\-\-show\-default\-cparams\fR: shows the default compression parameters that will be used for a particular input file, based on the provided compression level and the input size\. If the provided file is not a regular file (e\.g\. a pipe), this flag will output the parameters used for inputs of unknown size\.
    -.IP "\[ci]" 4
    +.
    +.IP "\(bu" 4
    +\fB\-\-exclude\-compressed\fR: only compress files that are not already compressed\.
    +.
    +.IP "\(bu" 4
     \fB\-\-\fR: All arguments after \fB\-\-\fR are treated as files
    +.
     .IP "" 0
    +.
     .SS "gzip Operation Modifiers"
     When invoked via a \fBgzip\fR symlink, \fBzstd\fR will support further options that intend to mimic the \fBgzip\fR behavior:
    +.
     .TP
     \fB\-n\fR, \fB\-\-no\-name\fR
     do not store the original filename and timestamps when compressing a file\. This is the default behavior and hence a no\-op\.
    +.
     .TP
     \fB\-\-best\fR
     alias to the option \fB\-9\fR\.
    +.
     .SS "Environment Variables"
    -Employing environment variables to set parameters has security implications\. Therefore, this avenue is intentionally limited\. Only \fBZSTD_CLEVEL\fR and \fBZSTD_NBTHREADS\fR are currently supported\. They set the compression level and number of threads to use during compression, respectively\.
    +Employing environment variables to set parameters has security implications\. Therefore, this avenue is intentionally limited\. Only \fBZSTD_CLEVEL\fR and \fBZSTD_NBTHREADS\fR are currently supported\. They set the default compression level and number of threads to use during compression, respectively\.
    +.
     .P
     \fBZSTD_CLEVEL\fR can be used to set the level between 1 and 19 (the "normal" range)\. If the value of \fBZSTD_CLEVEL\fR is not a valid integer, it will be ignored with a warning message\. \fBZSTD_CLEVEL\fR just replaces the default compression level (\fB3\fR)\.
    +.
     .P
    -\fBZSTD_NBTHREADS\fR can be used to set the number of threads \fBzstd\fR will attempt to use during compression\. If the value of \fBZSTD_NBTHREADS\fR is not a valid unsigned integer, it will be ignored with a warning message\. \fBZSTD_NBTHREADS\fR has a default value of (\fB1\fR), and is capped at ZSTDMT_NBWORKERS_MAX==200\. \fBzstd\fR must be compiled with multithread support for this to have any effect\.
    +\fBZSTD_NBTHREADS\fR can be used to set the number of threads \fBzstd\fR will attempt to use during compression\. If the value of \fBZSTD_NBTHREADS\fR is not a valid unsigned integer, it will be ignored with a warning message\. \fBZSTD_NBTHREADS\fR has a default value of (\fB1\fR), and is capped at ZSTDMT_NBWORKERS_MAX==200\. \fBzstd\fR must be compiled with multithread support for this variable to have any effect\.
    +.
     .P
     They can both be overridden by corresponding command line arguments: \fB\-#\fR for compression level and \fB\-T#\fR for number of compression threads\.
    -.SH "DICTIONARY BUILDER"
    -\fBzstd\fR offers \fIdictionary\fR compression, which greatly improves efficiency on small files and messages\. It\'s possible to train \fBzstd\fR with a set of samples, the result of which is saved into a file called a \fBdictionary\fR\. Then, during compression and decompression, reference the same dictionary, using command \fB\-D dictionaryFileName\fR\. Compression of small files similar to the sample set will be greatly improved\.
    -.TP
    -\fB\-\-train FILEs\fR
    -Use FILEs as training set to create a dictionary\. The training set should ideally contain a lot of samples (> 100), and weight typically 100x the target dictionary size (for example, ~10 MB for a 100 KB dictionary)\. \fB\-\-train\fR can be combined with \fB\-r\fR to indicate a directory rather than listing all the files, which can be useful to circumvent shell expansion limits\.
    -.IP
    -Since dictionary compression is mostly effective for small files, the expectation is that the training set will only contain small files\. In the case where some samples happen to be large, only the first 128 KiB of these samples will be used for training\.
    -.IP
    -\fB\-\-train\fR supports multithreading if \fBzstd\fR is compiled with threading support (default)\. Additional advanced parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. The slower cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Default \fB\-\-train\fR is equivalent to \fB\-\-train\-fastcover=d=8,steps=4\fR\.
    -.TP
    -\fB\-o FILE\fR
    -Dictionary saved into \fBFILE\fR (default name: dictionary)\.
    -.TP
    -\fB\-\-maxdict=#\fR
    -Limit dictionary to specified size (default: 112640 bytes)\. As usual, quantities are expressed in bytes by default, and it\'s possible to employ suffixes (like \fBKB\fR or \fBMB\fR) to specify larger values\.
    -.TP
    -\fB\-#\fR
    -Use \fB#\fR compression level during training (optional)\. Will generate statistics more tuned for selected compression level, resulting in a \fIsmall\fR compression ratio improvement for this level\.
    -.TP
    -\fB\-B#\fR
    -Split input files into blocks of size # (default: no split)
    -.TP
    -\fB\-M#\fR, \fB\-\-memory=#\fR
    -Limit the amount of sample data loaded for training (default: 2 GB)\. Note that the default (2 GB) is also the maximum\. This parameter can be useful in situations where the training set size is not well controlled and could be potentially very large\. Since speed of the training process is directly correlated to the size of the training sample set, a smaller sample set leads to faster training\.
    -.IP
    -In situations where the training set is larger than maximum memory, the CLI will randomly select samples among the available ones, up to the maximum allowed memory budget\. This is meant to improve dictionary relevance by mitigating the potential impact of clustering, such as selecting only files from the beginning of a list sorted by modification date, or sorted by alphabetical order\. The randomization process is deterministic, so training of the same list of files with the same parameters will lead to the creation of the same dictionary\.
    -.TP
    -\fB\-\-dictID=#\fR
    -A dictionary ID is a locally unique ID\. The decoder will use this value to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to provide an explicit number ID instead\. It\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\. Note that short numbers have an advantage: an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\.
    -.IP
    -Note that RFC8878 reserves IDs less than 32768 and greater than or equal to 2\e^31, so they should not be used in public\.
    -.TP
    -\fB\-\-train\-cover[=k#,d=#,steps=#,split=#,shrink[=#]]\fR
    -Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or split <= 0, then the default value of 100 is used\. Requires that \fId\fR <= \fIk\fR\. If \fIshrink\fR flag is not used, then the default value for \fIshrinkDict\fR of 0 is used\. If \fIshrink\fR is not specified, then the default value for \fIshrinkDictMaxRegression\fR of 1 is used\.
    -.IP
    -Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. If \fIsplit\fR is 100, all input samples are used for both training and testing to find optimal \fId\fR and \fIk\fR to build dictionary\. Supports multithreading if \fBzstd\fR is compiled with threading support\. Having \fIshrink\fR enabled takes a truncated dictionary of minimum size and doubles in size until compression ratio of the truncated dictionary is at most \fIshrinkDictMaxRegression%\fR worse than the compression ratio of the largest dictionary\.
    -.IP
    -Examples:
    -.IP
    -\fBzstd \-\-train\-cover FILEs\fR
    -.IP
    -\fBzstd \-\-train\-cover=k=50,d=8 FILEs\fR
    -.IP
    -\fBzstd \-\-train\-cover=d=8,steps=500 FILEs\fR
    -.IP
    -\fBzstd \-\-train\-cover=k=50 FILEs\fR
    -.IP
    -\fBzstd \-\-train\-cover=k=50,split=60 FILEs\fR
    -.IP
    -\fBzstd \-\-train\-cover=shrink FILEs\fR
    -.IP
    -\fBzstd \-\-train\-cover=shrink=2 FILEs\fR
    -.TP
    -\fB\-\-train\-fastcover[=k#,d=#,f=#,steps=#,split=#,accel=#]\fR
    -Same as cover but with extra parameters \fIf\fR and \fIaccel\fR and different default value of split If \fIsplit\fR is not specified, then it tries \fIsplit\fR = 75\. If \fIf\fR is not specified, then it tries \fIf\fR = 20\. Requires that 0 < \fIf\fR < 32\. If \fIaccel\fR is not specified, then it tries \fIaccel\fR = 1\. Requires that 0 < \fIaccel\fR <= 10\. Requires that \fId\fR = 6 or \fId\fR = 8\.
    -.IP
    -\fIf\fR is log of size of array that keeps track of frequency of subsegments of size \fId\fR\. The subsegment is hashed to an index in the range [0,2^\fIf\fR \- 1]\. It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency\. Using a higher \fIf\fR reduces collision but takes longer\.
    -.IP
    -Examples:
    -.IP
    -\fBzstd \-\-train\-fastcover FILEs\fR
    -.IP
    -\fBzstd \-\-train\-fastcover=d=8,f=15,accel=2 FILEs\fR
    -.TP
    -\fB\-\-train\-legacy[=selectivity=#]\fR
    -Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its achievable maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
    -.IP
    -Examples:
    -.IP
    -\fBzstd \-\-train\-legacy FILEs\fR
    -.IP
    -\fBzstd \-\-train\-legacy=selectivity=8 FILEs\fR
    -.SH "BENCHMARK"
    -.TP
    -\fB\-b#\fR
    -benchmark file(s) using compression level #
    -.TP
    -\fB\-e#\fR
    -benchmark file(s) using multiple compression levels, from \fB\-b#\fR to \fB\-e#\fR (inclusive)
    -.TP
    -\fB\-i#\fR
    -minimum evaluation time, in seconds (default: 3s), benchmark mode only
    -.TP
    -\fB\-B#\fR, \fB\-\-block\-size=#\fR
    -cut file(s) into independent chunks of size # (default: no chunking)
    -.TP
    -\fB\-\-priority=rt\fR
    -set process priority to real\-time
    -.P
    -\fBOutput Format:\fR CompressionLevel#Filename: InputSize \-> OutputSize (CompressionRatio), CompressionSpeed, DecompressionSpeed
    -.P
    -\fBMethodology:\fR For both compression and decompression speed, the entire input is compressed/decompressed in\-memory to measure speed\. A run lasts at least 1 sec, so when files are small, they are compressed/decompressed several times per run, in order to improve measurement accuracy\.
    +.
     .SH "ADVANCED COMPRESSION OPTIONS"
    -### \-B#: Specify the size of each compression job\. This parameter is only available when multi\-threading is enabled\. Each compression job is run in parallel, so this value indirectly impacts the nb of active threads\. Default job size varies depending on compression level (generally \fB4 * windowSize\fR)\. \fB\-B#\fR makes it possible to manually select a custom size\. Note that job size must respect a minimum value which is enforced transparently\. This minimum is either 512 KB, or \fBoverlapSize\fR, whichever is largest\. Different job sizes will lead to non\-identical compressed frames\.
    +\fBzstd\fR provides 22 predefined regular compression levels plus the fast levels\. A compression level is translated internally into multiple advanced parameters that control the behavior of the compressor (one can observe the result of this translation with \fB\-\-show\-default\-cparams\fR)\. These advanced parameters can be overridden using advanced compression options\.
    +.
     .SS "\-\-zstd[=options]:"
    -\fBzstd\fR provides 22 predefined regular compression levels plus the fast levels\. This compression level is translated internally into a number of specific parameters that actually control the behavior of the compressor\. (You can see the result of this translation with \fB\-\-show\-default\-cparams\fR\.) These specific parameters can be overridden with advanced compression options\. The \fIoptions\fR are provided as a comma\-separated list\. You may specify only the options you want to change and the rest will be taken from the selected or default compression level\. The list of available \fIoptions\fR:
    +The \fIoptions\fR are provided as a comma\-separated list\. You may specify only the options you want to change and the rest will be taken from the selected or default compression level\. The list of available \fIoptions\fR:
    +.
     .TP
     \fBstrategy\fR=\fIstrat\fR, \fBstrat\fR=\fIstrat\fR
     Specify a strategy used by a match finder\.
    +.
     .IP
     There are 9 strategies numbered from 1 to 9, from fastest to strongest: 1=\fBZSTD_fast\fR, 2=\fBZSTD_dfast\fR, 3=\fBZSTD_greedy\fR, 4=\fBZSTD_lazy\fR, 5=\fBZSTD_lazy2\fR, 6=\fBZSTD_btlazy2\fR, 7=\fBZSTD_btopt\fR, 8=\fBZSTD_btultra\fR, 9=\fBZSTD_btultra2\fR\.
    +.
     .TP
     \fBwindowLog\fR=\fIwlog\fR, \fBwlog\fR=\fIwlog\fR
     Specify the maximum number of bits for a match distance\.
    +.
     .IP
     The higher number of increases the chance to find a match which usually improves compression ratio\. It also increases memory requirements for the compressor and decompressor\. The minimum \fIwlog\fR is 10 (1 KiB) and the maximum is 30 (1 GiB) on 32\-bit platforms and 31 (2 GiB) on 64\-bit platforms\.
    +.
     .IP
     Note: If \fBwindowLog\fR is set to larger than 27, \fB\-\-long=windowLog\fR or \fB\-\-memory=windowSize\fR needs to be passed to the decompressor\.
    +.
     .TP
     \fBhashLog\fR=\fIhlog\fR, \fBhlog\fR=\fIhlog\fR
     Specify the maximum number of bits for a hash table\.
    +.
     .IP
     Bigger hash tables cause fewer collisions which usually makes compression faster, but requires more memory during compression\.
    +.
     .IP
     The minimum \fIhlog\fR is 6 (64 entries / 256 B) and the maximum is 30 (1B entries / 4 GiB)\.
    +.
     .TP
     \fBchainLog\fR=\fIclog\fR, \fBclog\fR=\fIclog\fR
     Specify the maximum number of bits for the secondary search structure, whose form depends on the selected \fBstrategy\fR\.
    +.
     .IP
     Higher numbers of bits increases the chance to find a match which usually improves compression ratio\. It also slows down compression speed and increases memory requirements for compression\. This option is ignored for the \fBZSTD_fast\fR \fBstrategy\fR, which only has the primary hash table\.
    +.
     .IP
     The minimum \fIclog\fR is 6 (64 entries / 256 B) and the maximum is 29 (512M entries / 2 GiB) on 32\-bit platforms and 30 (1B entries / 4 GiB) on 64\-bit platforms\.
    +.
     .TP
     \fBsearchLog\fR=\fIslog\fR, \fBslog\fR=\fIslog\fR
     Specify the maximum number of searches in a hash chain or a binary tree using logarithmic scale\.
    +.
     .IP
     More searches increases the chance to find a match which usually increases compression ratio but decreases compression speed\.
    +.
     .IP
     The minimum \fIslog\fR is 1 and the maximum is \'windowLog\' \- 1\.
    +.
     .TP
     \fBminMatch\fR=\fImml\fR, \fBmml\fR=\fImml\fR
     Specify the minimum searched length of a match in a hash table\.
    +.
     .IP
     Larger search lengths usually decrease compression ratio but improve decompression speed\.
    +.
     .IP
     The minimum \fImml\fR is 3 and the maximum is 7\.
    +.
     .TP
     \fBtargetLength\fR=\fItlen\fR, \fBtlen\fR=\fItlen\fR
     The impact of this field vary depending on selected strategy\.
    +.
     .IP
     For \fBZSTD_btopt\fR, \fBZSTD_btultra\fR and \fBZSTD_btultra2\fR, it specifies the minimum match length that causes match finder to stop searching\. A larger \fBtargetLength\fR usually improves compression ratio but decreases compression speed\.
    +.
     .IP
     For \fBZSTD_fast\fR, it triggers ultra\-fast mode when > 0\. The value represents the amount of data skipped between match sampling\. Impact is reversed: a larger \fBtargetLength\fR increases compression speed but decreases compression ratio\.
    +.
     .IP
     For all other strategies, this field has no impact\.
    +.
     .IP
     The minimum \fItlen\fR is 0 and the maximum is 128 KiB\.
    +.
     .TP
     \fBoverlapLog\fR=\fIovlog\fR, \fBovlog\fR=\fIovlog\fR
     Determine \fBoverlapSize\fR, amount of data reloaded from previous job\. This parameter is only available when multithreading is enabled\. Reloading more data improves compression ratio, but decreases speed\.
    +.
     .IP
     The minimum \fIovlog\fR is 0, and the maximum is 9\. 1 means "no overlap", hence completely independent jobs\. 9 means "full overlap", meaning up to \fBwindowSize\fR is reloaded from previous job\. Reducing \fIovlog\fR by 1 reduces the reloaded amount by a factor 2\. For example, 8 means "windowSize/2", and 6 means "windowSize/8"\. Value 0 is special and means "default": \fIovlog\fR is automatically determined by \fBzstd\fR\. In which case, \fIovlog\fR will range from 6 to 9, depending on selected \fIstrat\fR\.
    +.
     .TP
     \fBldmHashLog\fR=\fIlhlog\fR, \fBlhlog\fR=\fIlhlog\fR
     Specify the maximum size for a hash table used for long distance matching\.
    +.
     .IP
     This option is ignored unless long distance matching is enabled\.
    +.
     .IP
     Bigger hash tables usually improve compression ratio at the expense of more memory during compression and a decrease in compression speed\.
    +.
     .IP
     The minimum \fIlhlog\fR is 6 and the maximum is 30 (default: 20)\.
    +.
     .TP
     \fBldmMinMatch\fR=\fIlmml\fR, \fBlmml\fR=\fIlmml\fR
     Specify the minimum searched length of a match for long distance matching\.
    +.
     .IP
     This option is ignored unless long distance matching is enabled\.
    +.
     .IP
     Larger/very small values usually decrease compression ratio\.
    +.
     .IP
     The minimum \fIlmml\fR is 4 and the maximum is 4096 (default: 64)\.
    +.
     .TP
     \fBldmBucketSizeLog\fR=\fIlblog\fR, \fBlblog\fR=\fIlblog\fR
     Specify the size of each bucket for the hash table used for long distance matching\.
    +.
     .IP
     This option is ignored unless long distance matching is enabled\.
    +.
     .IP
     Larger bucket sizes improve collision resolution but decrease compression speed\.
    +.
     .IP
     The minimum \fIlblog\fR is 1 and the maximum is 8 (default: 3)\.
    +.
     .TP
     \fBldmHashRateLog\fR=\fIlhrlog\fR, \fBlhrlog\fR=\fIlhrlog\fR
     Specify the frequency of inserting entries into the long distance matching hash table\.
    +.
     .IP
     This option is ignored unless long distance matching is enabled\.
    +.
     .IP
     Larger values will improve compression speed\. Deviating far from the default value will likely result in a decrease in compression ratio\.
    +.
     .IP
     The default value is \fBwlog \- lhlog\fR\.
    +.
     .SS "Example"
     The following parameters sets advanced compression options to something similar to predefined level 19 for files bigger than 256 KB:
    +.
     .P
     \fB\-\-zstd\fR=wlog=23,clog=23,hlog=22,slog=6,mml=3,tlen=48,strat=6
    +.
    +.SS "\-B#:"
    +Specify the size of each compression job\. This parameter is only available when multi\-threading is enabled\. Each compression job is run in parallel, so this value indirectly impacts the nb of active threads\. Default job size varies depending on compression level (generally \fB4 * windowSize\fR)\. \fB\-B#\fR makes it possible to manually select a custom size\. Note that job size must respect a minimum value which is enforced transparently\. This minimum is either 512 KB, or \fBoverlapSize\fR, whichever is largest\. Different job sizes will lead to non\-identical compressed frames\.
    +.
    +.SH "DICTIONARY BUILDER"
    +\fBzstd\fR offers \fIdictionary\fR compression, which greatly improves efficiency on small files and messages\. It\'s possible to train \fBzstd\fR with a set of samples, the result of which is saved into a file called a \fBdictionary\fR\. Then, during compression and decompression, reference the same dictionary, using command \fB\-D dictionaryFileName\fR\. Compression of small files similar to the sample set will be greatly improved\.
    +.
    +.TP
    +\fB\-\-train FILEs\fR
    +Use FILEs as training set to create a dictionary\. The training set should ideally contain a lot of samples (> 100), and weight typically 100x the target dictionary size (for example, ~10 MB for a 100 KB dictionary)\. \fB\-\-train\fR can be combined with \fB\-r\fR to indicate a directory rather than listing all the files, which can be useful to circumvent shell expansion limits\.
    +.
    +.IP
    +Since dictionary compression is mostly effective for small files, the expectation is that the training set will only contain small files\. In the case where some samples happen to be large, only the first 128 KiB of these samples will be used for training\.
    +.
    +.IP
    +\fB\-\-train\fR supports multithreading if \fBzstd\fR is compiled with threading support (default)\. Additional advanced parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. The slower cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Default \fB\-\-train\fR is equivalent to \fB\-\-train\-fastcover=d=8,steps=4\fR\.
    +.
    +.TP
    +\fB\-o FILE\fR
    +Dictionary saved into \fBFILE\fR (default name: dictionary)\.
    +.
    +.TP
    +\fB\-\-maxdict=#\fR
    +Limit dictionary to specified size (default: 112640 bytes)\. As usual, quantities are expressed in bytes by default, and it\'s possible to employ suffixes (like \fBKB\fR or \fBMB\fR) to specify larger values\.
    +.
    +.TP
    +\fB\-#\fR
    +Use \fB#\fR compression level during training (optional)\. Will generate statistics more tuned for selected compression level, resulting in a \fIsmall\fR compression ratio improvement for this level\.
    +.
    +.TP
    +\fB\-B#\fR
    +Split input files into blocks of size # (default: no split)
    +.
    +.TP
    +\fB\-M#\fR, \fB\-\-memory=#\fR
    +Limit the amount of sample data loaded for training (default: 2 GB)\. Note that the default (2 GB) is also the maximum\. This parameter can be useful in situations where the training set size is not well controlled and could be potentially very large\. Since speed of the training process is directly correlated to the size of the training sample set, a smaller sample set leads to faster training\.
    +.
    +.IP
    +In situations where the training set is larger than maximum memory, the CLI will randomly select samples among the available ones, up to the maximum allowed memory budget\. This is meant to improve dictionary relevance by mitigating the potential impact of clustering, such as selecting only files from the beginning of a list sorted by modification date, or sorted by alphabetical order\. The randomization process is deterministic, so training of the same list of files with the same parameters will lead to the creation of the same dictionary\.
    +.
    +.TP
    +\fB\-\-dictID=#\fR
    +A dictionary ID is a locally unique ID\. The decoder will use this value to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to provide an explicit number ID instead\. It\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\. Note that short numbers have an advantage: an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\.
    +.
    +.IP
    +Note that RFC8878 reserves IDs less than 32768 and greater than or equal to 2^31, so they should not be used in public\.
    +.
    +.TP
    +\fB\-\-train\-cover[=k#,d=#,steps=#,split=#,shrink[=#]]\fR
    +Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or split <= 0, then the default value of 100 is used\. Requires that \fId\fR <= \fIk\fR\. If \fIshrink\fR flag is not used, then the default value for \fIshrinkDict\fR of 0 is used\. If \fIshrink\fR is not specified, then the default value for \fIshrinkDictMaxRegression\fR of 1 is used\.
    +.
    +.IP
    +Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. If \fIsplit\fR is 100, all input samples are used for both training and testing to find optimal \fId\fR and \fIk\fR to build dictionary\. Supports multithreading if \fBzstd\fR is compiled with threading support\. Having \fIshrink\fR enabled takes a truncated dictionary of minimum size and doubles in size until compression ratio of the truncated dictionary is at most \fIshrinkDictMaxRegression%\fR worse than the compression ratio of the largest dictionary\.
    +.
    +.IP
    +Examples:
    +.
    +.IP
    +\fBzstd \-\-train\-cover FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-cover=k=50,d=8 FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-cover=d=8,steps=500 FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-cover=k=50 FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-cover=k=50,split=60 FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-cover=shrink FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-cover=shrink=2 FILEs\fR
    +.
    +.TP
    +\fB\-\-train\-fastcover[=k#,d=#,f=#,steps=#,split=#,accel=#]\fR
    +Same as cover but with extra parameters \fIf\fR and \fIaccel\fR and different default value of split If \fIsplit\fR is not specified, then it tries \fIsplit\fR = 75\. If \fIf\fR is not specified, then it tries \fIf\fR = 20\. Requires that 0 < \fIf\fR < 32\. If \fIaccel\fR is not specified, then it tries \fIaccel\fR = 1\. Requires that 0 < \fIaccel\fR <= 10\. Requires that \fId\fR = 6 or \fId\fR = 8\.
    +.
    +.IP
    +\fIf\fR is log of size of array that keeps track of frequency of subsegments of size \fId\fR\. The subsegment is hashed to an index in the range [0,2^\fIf\fR \- 1]\. It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency\. Using a higher \fIf\fR reduces collision but takes longer\.
    +.
    +.IP
    +Examples:
    +.
    +.IP
    +\fBzstd \-\-train\-fastcover FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-fastcover=d=8,f=15,accel=2 FILEs\fR
    +.
    +.TP
    +\fB\-\-train\-legacy[=selectivity=#]\fR
    +Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its achievable maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
    +.
    +.IP
    +Examples:
    +.
    +.IP
    +\fBzstd \-\-train\-legacy FILEs\fR
    +.
    +.IP
    +\fBzstd \-\-train\-legacy=selectivity=8 FILEs\fR
    +.
    +.SH "BENCHMARK"
    +The \fBzstd\fR CLI provides a benchmarking mode that can be used to easily find suitable compression parameters, or alternatively to benchmark a computer\'s performance\. Note that the results are highly dependent on the content being compressed\.
    +.
    +.TP
    +\fB\-b#\fR
    +benchmark file(s) using compression level #
    +.
    +.TP
    +\fB\-e#\fR
    +benchmark file(s) using multiple compression levels, from \fB\-b#\fR to \fB\-e#\fR (inclusive)
    +.
    +.TP
    +\fB\-d\fR
    +benchmark decompression speed only (requires providing an already zstd\-compressed content)
    +.
    +.TP
    +\fB\-i#\fR
    +minimum evaluation time, in seconds (default: 3s), benchmark mode only
    +.
    +.TP
    +\fB\-B#\fR, \fB\-\-block\-size=#\fR
    +cut file(s) into independent chunks of size # (default: no chunking)
    +.
    +.TP
    +\fB\-\-priority=rt\fR
    +set process priority to real\-time (Windows)
    +.
    +.P
    +\fBOutput Format:\fR CompressionLevel#Filename: InputSize \-> OutputSize (CompressionRatio), CompressionSpeed, DecompressionSpeed
    +.
    +.P
    +\fBMethodology:\fR For both compression and decompression speed, the entire input is compressed/decompressed in\-memory to measure speed\. A run lasts at least 1 sec, so when files are small, they are compressed/decompressed several times per run, in order to improve measurement accuracy\.
    +.
     .SH "SEE ALSO"
     \fBzstdgrep\fR(1), \fBzstdless\fR(1), \fBgzip\fR(1), \fBxz\fR(1)
    +.
     .P
     The \fIzstandard\fR format is specified in Y\. Collet, "Zstandard Compression and the \'application/zstd\' Media Type", https://www\.ietf\.org/rfc/rfc8878\.txt, Internet RFC 8878 (February 2021)\.
    +.
     .SH "BUGS"
     Report bugs at: https://github\.com/facebook/zstd/issues
    +.
     .SH "AUTHOR"
     Yann Collet
    diff --git a/third-party/zstd/programs/zstd.1.md b/third-party/zstd/programs/zstd.1.md
    index 231341b2..fcbfb457 100644
    --- a/third-party/zstd/programs/zstd.1.md
    +++ b/third-party/zstd/programs/zstd.1.md
    @@ -21,10 +21,11 @@ It is based on the **LZ77** family, with further FSE & huff0 entropy stages.
     `zstd` offers highly configurable compression speed,
     from fast modes at > 200 MB/s per core,
     to strong modes with excellent compression ratios.
    -It also features a very fast decoder, with speeds > 500 MB/s per core.
    +It also features a very fast decoder, with speeds > 500 MB/s per core,
    +which remains roughly stable at all compression settings.
     
     `zstd` command line syntax is generally similar to gzip,
    -but features the following differences:
    +but features the following few differences:
     
       - Source files are preserved by default.
         It's possible to remove them automatically by using the `--rm` command.
    @@ -105,7 +106,11 @@ the last one takes effect.
     ### Operation Modifiers
     
     * `-#`:
    -    selects `#` compression level \[1-19\] (default: 3)
    +    selects `#` compression level \[1-19\] (default: 3).
    +    Higher compression levels *generally* produce higher compression ratio at the expense of speed and memory.
    +    A rough rule of thumb is that compression speed is expected to be divided by 2 every 2 levels.
    +    Technically, each level is mapped to a set of advanced parameters (that can also be modified individually, see below).
    +    Because the compressor's behavior highly depends on the content to compress, there's no guarantee of a smooth progression from one level to another.
     * `--ultra`:
         unlocks high compression levels 20+ (maximum 22), using a lot more memory.
         Note that decompression will also require more memory when using these levels.
    @@ -218,15 +223,24 @@ the last one takes effect.
         expected. This feature allows for controlling the guess when needed.
         Exact guesses result in better compression ratios. Overestimates result in slightly
         degraded compression ratios, while underestimates may result in significant degradation.
    -* `-o FILE`:
    -    save result into `FILE`.
    +* `--target-compressed-block-size=#`:
    +    Attempt to produce compressed blocks of approximately this size.
    +    This will split larger blocks in order to approach this target.
    +    This feature is notably useful for improved latency, when the receiver can leverage receiving early incomplete data.
    +    This parameter defines a loose target: compressed blocks will target this size "on average", but individual blocks can still be larger or smaller.
    +    Enabling this feature can decrease compression speed by up to ~10% at level 1.
    +    Higher levels will see smaller relative speed regression, becoming invisible at higher settings.
     * `-f`, `--force`:
         disable input and output checks. Allows overwriting existing files, input
         from console, output to stdout, operating on links, block devices, etc.
         During decompression and when the output destination is stdout, pass-through
         unrecognized formats as-is.
     * `-c`, `--stdout`:
    -    write to standard output (even if it is the console); keep original files unchanged.
    +    write to standard output (even if it is the console); keep original files (disable `--rm`).
    +* `-o FILE`:
    +    save result into `FILE`.
    +    Note that this operation is in conflict with `-c`.
    +    If both operations are present on the command line, the last expressed one wins.
     * `--[no-]sparse`:
         enable / disable sparse FS support,
         to make files with many zeroes smaller on disk.
    @@ -283,10 +297,11 @@ the last one takes effect.
     * `-h`/`-H`, `--help`:
         display help/long help and exit
     * `-V`, `--version`:
    -    display version number and exit.
    +    display version number and immediately exit.
    +    note that, since it exits, flags specified after `-V` are effectively ignored.
         Advanced: `-vV` also displays supported formats.
         `-vvV` also displays POSIX support.
    -    `-q` will only display the version number, suitable for machine reading.
    +    `-qV` will only display the version number, suitable for machine reading.
     * `-v`, `--verbose`:
         verbose mode, display more information
     * `-q`, `--quiet`:
    @@ -297,6 +312,8 @@ the last one takes effect.
     * `--show-default-cparams`:
         shows the default compression parameters that will be used for a particular input file, based on the provided compression level and the input size.
         If the provided file is not a regular file (e.g. a pipe), this flag will output the parameters used for inputs of unknown size.
    +* `--exclude-compressed`:
    +    only compress files that are not already compressed.
     * `--`:
         All arguments after `--` are treated as files
     
    @@ -313,11 +330,10 @@ options that intend to mimic the `gzip` behavior:
     
     
     ### Environment Variables
    -
     Employing environment variables to set parameters has security implications.
     Therefore, this avenue is intentionally limited.
     Only `ZSTD_CLEVEL` and `ZSTD_NBTHREADS` are currently supported.
    -They set the compression level and number of threads to use during compression, respectively.
    +They set the default compression level and number of threads to use during compression, respectively.
     
     `ZSTD_CLEVEL` can be used to set the level between 1 and 19 (the "normal" range).
     If the value of `ZSTD_CLEVEL` is not a valid integer, it will be ignored with a warning message.
    @@ -326,12 +342,171 @@ If the value of `ZSTD_CLEVEL` is not a valid integer, it will be ignored with a
     `ZSTD_NBTHREADS` can be used to set the number of threads `zstd` will attempt to use during compression.
     If the value of `ZSTD_NBTHREADS` is not a valid unsigned integer, it will be ignored with a warning message.
     `ZSTD_NBTHREADS` has a default value of (`1`), and is capped at ZSTDMT_NBWORKERS_MAX==200.
    -`zstd` must be compiled with multithread support for this to have any effect.
    +`zstd` must be compiled with multithread support for this variable to have any effect.
     
     They can both be overridden by corresponding command line arguments:
     `-#` for compression level and `-T#` for number of compression threads.
     
     
    +ADVANCED COMPRESSION OPTIONS
    +----------------------------
    +`zstd` provides 22 predefined regular compression levels plus the fast levels.
    +A compression level is translated internally into multiple advanced parameters that control the behavior of the compressor
    +(one can observe the result of this translation with `--show-default-cparams`).
    +These advanced parameters can be overridden using advanced compression options.
    +
    +### --zstd[=options]:
    +The _options_ are provided as a comma-separated list.
    +You may specify only the options you want to change and the rest will be
    +taken from the selected or default compression level.
    +The list of available _options_:
    +
    +- `strategy`=_strat_, `strat`=_strat_:
    +    Specify a strategy used by a match finder.
    +
    +    There are 9 strategies numbered from 1 to 9, from fastest to strongest:
    +    1=`ZSTD_fast`, 2=`ZSTD_dfast`, 3=`ZSTD_greedy`,
    +    4=`ZSTD_lazy`, 5=`ZSTD_lazy2`, 6=`ZSTD_btlazy2`,
    +    7=`ZSTD_btopt`, 8=`ZSTD_btultra`, 9=`ZSTD_btultra2`.
    +
    +- `windowLog`=_wlog_, `wlog`=_wlog_:
    +    Specify the maximum number of bits for a match distance.
    +
    +    A higher number of bits increases the chance to find a match, which usually
    +    improves compression ratio.
    +    It also increases memory requirements for the compressor and decompressor.
    +    The minimum _wlog_ is 10 (1 KiB) and the maximum is 30 (1 GiB) on 32-bit
    +    platforms and 31 (2 GiB) on 64-bit platforms.
    +
    +    Note: If `windowLog` is set to larger than 27, `--long=windowLog` or
    +    `--memory=windowSize` needs to be passed to the decompressor.
    +
    +- `hashLog`=_hlog_, `hlog`=_hlog_:
    +    Specify the maximum number of bits for a hash table.
    +
    +    Bigger hash tables cause fewer collisions which usually makes compression
    +    faster, but requires more memory during compression.
    +
    +    The minimum _hlog_ is 6 (64 entries / 256 B) and the maximum is 30 (1B entries / 4 GiB).
    +
    +- `chainLog`=_clog_, `clog`=_clog_:
    +    Specify the maximum number of bits for the secondary search structure,
    +    whose form depends on the selected `strategy`.
    +
    +    Higher numbers of bits increase the chance to find a match, which usually
    +    improves compression ratio.
    +    It also slows down compression speed and increases memory requirements for
    +    compression.
    +    This option is ignored for the `ZSTD_fast` `strategy`, which only has the primary hash table.
    +
    +    The minimum _clog_ is 6 (64 entries / 256 B) and the maximum is 29 (512M entries / 2 GiB) on 32-bit platforms
    +    and 30 (1B entries / 4 GiB) on 64-bit platforms.
    +
    +- `searchLog`=_slog_, `slog`=_slog_:
    +    Specify the maximum number of searches in a hash chain or a binary tree
    +    using logarithmic scale.
    +
    +    More searches increase the chance to find a match, which usually increases
    +    compression ratio but decreases compression speed.
    +
    +    The minimum _slog_ is 1 and the maximum is 'windowLog' - 1.
    +
    +- `minMatch`=_mml_, `mml`=_mml_:
    +    Specify the minimum searched length of a match in a hash table.
    +
    +    Larger search lengths usually decrease compression ratio but improve
    +    decompression speed.
    +
    +    The minimum _mml_ is 3 and the maximum is 7.
    +
    +- `targetLength`=_tlen_, `tlen`=_tlen_:
    +    The impact of this field varies depending on the selected strategy.
    +
    +    For `ZSTD_btopt`, `ZSTD_btultra` and `ZSTD_btultra2`, it specifies
    +    the minimum match length that causes match finder to stop searching.
    +    A larger `targetLength` usually improves compression ratio
    +    but decreases compression speed.
    +
    +    For `ZSTD_fast`, it triggers ultra-fast mode when > 0.
    +    The value represents the amount of data skipped between match sampling.
    +    Impact is reversed: a larger `targetLength` increases compression speed
    +    but decreases compression ratio.
    +
    +    For all other strategies, this field has no impact.
    +
    +    The minimum _tlen_ is 0 and the maximum is 128 KiB.
    +
    +- `overlapLog`=_ovlog_,  `ovlog`=_ovlog_:
    +    Determine `overlapSize`, amount of data reloaded from previous job.
    +    This parameter is only available when multithreading is enabled.
    +    Reloading more data improves compression ratio, but decreases speed.
    +
    +    The minimum _ovlog_ is 0, and the maximum is 9.
    +    1 means "no overlap", hence completely independent jobs.
    +    9 means "full overlap", meaning up to `windowSize` is reloaded from previous job.
    +    Reducing _ovlog_ by 1 reduces the reloaded amount by a factor 2.
    +    For example, 8 means "windowSize/2", and 6 means "windowSize/8".
    +    Value 0 is special and means "default": _ovlog_ is automatically determined by `zstd`.
    +    In which case, _ovlog_ will range from 6 to 9, depending on selected _strat_.
    +
    +- `ldmHashLog`=_lhlog_, `lhlog`=_lhlog_:
    +    Specify the maximum size for a hash table used for long distance matching.
    +
    +    This option is ignored unless long distance matching is enabled.
    +
    +    Bigger hash tables usually improve compression ratio at the expense of more
    +    memory during compression and a decrease in compression speed.
    +
    +    The minimum _lhlog_ is 6 and the maximum is 30 (default: 20).
    +
    +- `ldmMinMatch`=_lmml_, `lmml`=_lmml_:
    +    Specify the minimum searched length of a match for long distance matching.
    +
    +    This option is ignored unless long distance matching is enabled.
    +
    +    Larger/very small values usually decrease compression ratio.
    +
    +    The minimum _lmml_ is 4 and the maximum is 4096 (default: 64).
    +
    +- `ldmBucketSizeLog`=_lblog_, `lblog`=_lblog_:
    +    Specify the size of each bucket for the hash table used for long distance
    +    matching.
    +
    +    This option is ignored unless long distance matching is enabled.
    +
    +    Larger bucket sizes improve collision resolution but decrease compression
    +    speed.
    +
    +    The minimum _lblog_ is 1 and the maximum is 8 (default: 3).
    +
    +- `ldmHashRateLog`=_lhrlog_, `lhrlog`=_lhrlog_:
    +    Specify the frequency of inserting entries into the long distance matching
    +    hash table.
    +
    +    This option is ignored unless long distance matching is enabled.
    +
    +    Larger values will improve compression speed. Deviating far from the
    +    default value will likely result in a decrease in compression ratio.
    +
    +    The default value is `wlog - lhlog`.
    +
    +### Example
    +The following parameters set advanced compression options to something
    +similar to predefined level 19 for files bigger than 256 KB:
    +
    +`--zstd`=wlog=23,clog=23,hlog=22,slog=6,mml=3,tlen=48,strat=6
    +
    +### -B#:
    +Specify the size of each compression job.
    +This parameter is only available when multi-threading is enabled.
    +Each compression job is run in parallel, so this value indirectly impacts the number of active threads.
    +Default job size varies depending on compression level (generally  `4 * windowSize`).
    +`-B#` makes it possible to manually select a custom size.
    +Note that job size must respect a minimum value which is enforced transparently.
    +This minimum is either 512 KB, or `overlapSize`, whichever is largest.
    +Different job sizes will lead to non-identical compressed frames.
    +
    +
     DICTIONARY BUILDER
     ------------------
     `zstd` offers _dictionary_ compression,
    @@ -484,178 +659,26 @@ Compression of small files similar to the sample set will be greatly improved.
     
     BENCHMARK
     ---------
    +The `zstd` CLI provides a benchmarking mode that can be used to easily find suitable compression parameters, or alternatively to benchmark a computer's performance.
    +Note that the results are highly dependent on the content being compressed.
     
     * `-b#`:
         benchmark file(s) using compression level #
     * `-e#`:
         benchmark file(s) using multiple compression levels, from `-b#` to `-e#` (inclusive)
    +* `-d`:
    +    benchmark decompression speed only (requires providing an already zstd-compressed content)
     * `-i#`:
         minimum evaluation time, in seconds (default: 3s), benchmark mode only
     * `-B#`, `--block-size=#`:
         cut file(s) into independent chunks of size # (default: no chunking)
     * `--priority=rt`:
    -    set process priority to real-time
    +    set process priority to real-time (Windows)
     
     **Output Format:** CompressionLevel#Filename: InputSize -> OutputSize (CompressionRatio), CompressionSpeed, DecompressionSpeed
     
     **Methodology:** For both compression and decompression speed, the entire input is compressed/decompressed in-memory to measure speed. A run lasts at least 1 sec, so when files are small, they are compressed/decompressed several times per run, in order to improve measurement accuracy.
     
    -ADVANCED COMPRESSION OPTIONS
    -----------------------------
    -### -B#:
    -Specify the size of each compression job.
    -This parameter is only available when multi-threading is enabled.
    -Each compression job is run in parallel, so this value indirectly impacts the nb of active threads.
    -Default job size varies depending on compression level (generally  `4 * windowSize`).
    -`-B#` makes it possible to manually select a custom size.
    -Note that job size must respect a minimum value which is enforced transparently.
    -This minimum is either 512 KB, or `overlapSize`, whichever is largest.
    -Different job sizes will lead to non-identical compressed frames.
    -
    -### --zstd[=options]:
    -`zstd` provides 22 predefined regular compression levels plus the fast levels.
    -This compression level is translated internally into a number of specific parameters that actually control the behavior of the compressor.
    -(You can see the result of this translation with `--show-default-cparams`.)
    -These specific parameters can be overridden with advanced compression options.
    -The _options_ are provided as a comma-separated list.
    -You may specify only the options you want to change and the rest will be
    -taken from the selected or default compression level.
    -The list of available _options_:
    -
    -- `strategy`=_strat_, `strat`=_strat_:
    -    Specify a strategy used by a match finder.
    -
    -    There are 9 strategies numbered from 1 to 9, from fastest to strongest:
    -    1=`ZSTD_fast`, 2=`ZSTD_dfast`, 3=`ZSTD_greedy`,
    -    4=`ZSTD_lazy`, 5=`ZSTD_lazy2`, 6=`ZSTD_btlazy2`,
    -    7=`ZSTD_btopt`, 8=`ZSTD_btultra`, 9=`ZSTD_btultra2`.
    -
    -- `windowLog`=_wlog_, `wlog`=_wlog_:
    -    Specify the maximum number of bits for a match distance.
    -
    -    The higher number of increases the chance to find a match which usually
    -    improves compression ratio.
    -    It also increases memory requirements for the compressor and decompressor.
    -    The minimum _wlog_ is 10 (1 KiB) and the maximum is 30 (1 GiB) on 32-bit
    -    platforms and 31 (2 GiB) on 64-bit platforms.
    -
    -    Note: If `windowLog` is set to larger than 27, `--long=windowLog` or
    -    `--memory=windowSize` needs to be passed to the decompressor.
    -
    -- `hashLog`=_hlog_, `hlog`=_hlog_:
    -    Specify the maximum number of bits for a hash table.
    -
    -    Bigger hash tables cause fewer collisions which usually makes compression
    -    faster, but requires more memory during compression.
    -
    -    The minimum _hlog_ is 6 (64 entries / 256 B) and the maximum is 30 (1B entries / 4 GiB).
    -
    -- `chainLog`=_clog_, `clog`=_clog_:
    -    Specify the maximum number of bits for the secondary search structure,
    -    whose form depends on the selected `strategy`.
    -
    -    Higher numbers of bits increases the chance to find a match which usually
    -    improves compression ratio.
    -    It also slows down compression speed and increases memory requirements for
    -    compression.
    -    This option is ignored for the `ZSTD_fast` `strategy`, which only has the primary hash table.
    -
    -    The minimum _clog_ is 6 (64 entries / 256 B) and the maximum is 29 (512M entries / 2 GiB) on 32-bit platforms
    -    and 30 (1B entries / 4 GiB) on 64-bit platforms.
    -
    -- `searchLog`=_slog_, `slog`=_slog_:
    -    Specify the maximum number of searches in a hash chain or a binary tree
    -    using logarithmic scale.
    -
    -    More searches increases the chance to find a match which usually increases
    -    compression ratio but decreases compression speed.
    -
    -    The minimum _slog_ is 1 and the maximum is 'windowLog' - 1.
    -
    -- `minMatch`=_mml_, `mml`=_mml_:
    -    Specify the minimum searched length of a match in a hash table.
    -
    -    Larger search lengths usually decrease compression ratio but improve
    -    decompression speed.
    -
    -    The minimum _mml_ is 3 and the maximum is 7.
    -
    -- `targetLength`=_tlen_, `tlen`=_tlen_:
    -    The impact of this field vary depending on selected strategy.
    -
    -    For `ZSTD_btopt`, `ZSTD_btultra` and `ZSTD_btultra2`, it specifies
    -    the minimum match length that causes match finder to stop searching.
    -    A larger `targetLength` usually improves compression ratio
    -    but decreases compression speed.
    -
    -    For `ZSTD_fast`, it triggers ultra-fast mode when > 0.
    -    The value represents the amount of data skipped between match sampling.
    -    Impact is reversed: a larger `targetLength` increases compression speed
    -    but decreases compression ratio.
    -
    -    For all other strategies, this field has no impact.
    -
    -    The minimum _tlen_ is 0 and the maximum is 128 KiB.
    -
    -- `overlapLog`=_ovlog_,  `ovlog`=_ovlog_:
    -    Determine `overlapSize`, amount of data reloaded from previous job.
    -    This parameter is only available when multithreading is enabled.
    -    Reloading more data improves compression ratio, but decreases speed.
    -
    -    The minimum _ovlog_ is 0, and the maximum is 9.
    -    1 means "no overlap", hence completely independent jobs.
    -    9 means "full overlap", meaning up to `windowSize` is reloaded from previous job.
    -    Reducing _ovlog_ by 1 reduces the reloaded amount by a factor 2.
    -    For example, 8 means "windowSize/2", and 6 means "windowSize/8".
    -    Value 0 is special and means "default": _ovlog_ is automatically determined by `zstd`.
    -    In which case, _ovlog_ will range from 6 to 9, depending on selected _strat_.
    -
    -- `ldmHashLog`=_lhlog_, `lhlog`=_lhlog_:
    -    Specify the maximum size for a hash table used for long distance matching.
    -
    -    This option is ignored unless long distance matching is enabled.
    -
    -    Bigger hash tables usually improve compression ratio at the expense of more
    -    memory during compression and a decrease in compression speed.
    -
    -    The minimum _lhlog_ is 6 and the maximum is 30 (default: 20).
    -
    -- `ldmMinMatch`=_lmml_, `lmml`=_lmml_:
    -    Specify the minimum searched length of a match for long distance matching.
    -
    -    This option is ignored unless long distance matching is enabled.
    -
    -    Larger/very small values usually decrease compression ratio.
    -
    -    The minimum _lmml_ is 4 and the maximum is 4096 (default: 64).
    -
    -- `ldmBucketSizeLog`=_lblog_, `lblog`=_lblog_:
    -    Specify the size of each bucket for the hash table used for long distance
    -    matching.
    -
    -    This option is ignored unless long distance matching is enabled.
    -
    -    Larger bucket sizes improve collision resolution but decrease compression
    -    speed.
    -
    -    The minimum _lblog_ is 1 and the maximum is 8 (default: 3).
    -
    -- `ldmHashRateLog`=_lhrlog_, `lhrlog`=_lhrlog_:
    -    Specify the frequency of inserting entries into the long distance matching
    -    hash table.
    -
    -    This option is ignored unless long distance matching is enabled.
    -
    -    Larger values will improve compression speed. Deviating far from the
    -    default value will likely result in a decrease in compression ratio.
    -
    -    The default value is `wlog - lhlog`.
    -
    -### Example
    -The following parameters sets advanced compression options to something
    -similar to predefined level 19 for files bigger than 256 KB:
    -
    -`--zstd`=wlog=23,clog=23,hlog=22,slog=6,mml=3,tlen=48,strat=6
     
     SEE ALSO
     --------
    diff --git a/third-party/zstd/programs/zstdcli.c b/third-party/zstd/programs/zstdcli.c
    index d2465456..9dd6b051 100644
    --- a/third-party/zstd/programs/zstdcli.c
    +++ b/third-party/zstd/programs/zstdcli.c
    @@ -138,8 +138,8 @@ static int exeNameMatch(const char* exeName, const char* test)
     *  Command Line
     **************************************/
     /* print help either in `stderr` or `stdout` depending on originating request
    - * error (badusage) => stderr
    - * help (usage_advanced) => stdout
    + * error (badUsage) => stderr
    + * help (usageAdvanced) => stdout
      */
     static void usage(FILE* f, const char* programName)
     {
    @@ -175,7 +175,7 @@ static void usage(FILE* f, const char* programName)
         DISPLAY_F(f, "\n");
     }
     
    -static void usage_advanced(const char* programName)
    +static void usageAdvanced(const char* programName)
     {
         DISPLAYOUT(WELCOME_MESSAGE);
         DISPLAYOUT("\n");
    @@ -254,7 +254,7 @@ static void usage_advanced(const char* programName)
     
         DISPLAYOUT("\n");
         DISPLAYOUT("  --format=zstd                 Compress files to the `.zst` format. [Default]\n");
    -    DISPLAYOUT("  --mmap-dict                   Memory-map dictionary file rather than mallocing and loading all at once");
    +    DISPLAYOUT("  --[no-]mmap-dict              Memory-map dictionary file rather than mallocing and loading all at once\n");
     #ifdef ZSTD_GZCOMPRESS
         DISPLAYOUT("  --format=gzip                 Compress files to the `.gz` format.\n");
     #endif
    @@ -316,9 +316,9 @@ static void usage_advanced(const char* programName)
     
     }
     
    -static void badusage(const char* programName)
    +static void badUsage(const char* programName, const char* parameter)
     {
    -    DISPLAYLEVEL(1, "Incorrect parameters \n");
    +    DISPLAYLEVEL(1, "Incorrect parameter: %s \n", parameter);
         if (g_displayLevel >= 2) usage(stderr, programName);
     }
     
    @@ -589,7 +589,7 @@ static ZDICT_fastCover_params_t defaultFastCoverParams(void)
     
     
     /** parseAdaptParameters() :
    - *  reads adapt parameters from *stringPtr (e.g. "--zstd=min=1,max=19) and store them into adaptMinPtr and adaptMaxPtr.
    + *  reads adapt parameters from *stringPtr (e.g. "--adapt=min=1,max=19") and stores them into adaptMinPtr and adaptMaxPtr.
      *  Both adaptMinPtr and adaptMaxPtr must be already allocated and correctly initialized.
      *  There is no guarantee that any of these values will be updated.
      *  @return 1 means that parsing was successful,
    @@ -856,7 +856,7 @@ int main(int argCount, const char* argv[])
         ZSTD_paramSwitch_e useRowMatchFinder = ZSTD_ps_auto;
         FIO_compressionType_t cType = FIO_zstdCompression;
         unsigned nbWorkers = 0;
    -    double compressibility = 0.5;
    +    double compressibility = -1.0;  /* lorem ipsum generator */
         unsigned bench_nbSeconds = 3;   /* would be better if this value was synchronized from bench */
         size_t blockSize = 0;
     
    @@ -933,6 +933,7 @@ int main(int argCount, const char* argv[])
         /* command switches */
         for (argNb=1; argNb&2
    +zstd --blah
    +println "+ zstd -xz" >&2
    +zstd -xz
    +println "+ zstd --adapt=min=1,maxx=2 file.txt" >&2
    +zstd --adapt=min=1,maxx=2 file.txt
    +println "+ zstd --train-cover=k=48,d=8,steps32 file.txt" >&2
    +zstd --train-cover=k=48,d=8,steps32 file.txt
    diff --git a/third-party/zstd/tests/cli-tests/basic/args.sh.exit b/third-party/zstd/tests/cli-tests/basic/args.sh.exit
    new file mode 100644
    index 00000000..d00491fd
    --- /dev/null
    +++ b/third-party/zstd/tests/cli-tests/basic/args.sh.exit
    @@ -0,0 +1 @@
    +1
    diff --git a/third-party/zstd/tests/cli-tests/basic/args.sh.stderr.glob b/third-party/zstd/tests/cli-tests/basic/args.sh.stderr.glob
    new file mode 100644
    index 00000000..df275471
    --- /dev/null
    +++ b/third-party/zstd/tests/cli-tests/basic/args.sh.stderr.glob
    @@ -0,0 +1,28 @@
    ++ zstd --blah
    +Incorrect parameter: --blah
    +...
    +Usage: zstd *
    +
    +Options:
    +...
    ++ zstd -xz
    +Incorrect parameter: -x
    +...
    +Usage: zstd *
    +
    +Options:
    +...
    ++ zstd --adapt=min=1,maxx=2 file.txt
    +Incorrect parameter: --adapt=min=1,maxx=2
    +...
    +Usage: zstd *
    +
    +Options:
    +...
    ++ zstd --train-cover=k=48,d=8,steps32 file.txt
    +Incorrect parameter: --train-cover=k=48,d=8,steps32
    +...
    +Usage: zstd *
    +
    +Options:
    +...
    diff --git a/third-party/zstd/tests/cli-tests/decompression/detectErrors.sh b/third-party/zstd/tests/cli-tests/decompression/detectErrors.sh
    new file mode 100755
    index 00000000..300cde36
    --- /dev/null
    +++ b/third-party/zstd/tests/cli-tests/decompression/detectErrors.sh
    @@ -0,0 +1,11 @@
    +#!/bin/sh
    +
    +set -e
    +
    +GOLDEN_DIR="$ZSTD_REPO_DIR/tests/golden-decompression-errors/"
    +
    +for file in "$GOLDEN_DIR"/*; do
    +    zstd -t $file && die "should have detected an error"
    +done
    +exit 0
    +
    diff --git a/third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh b/third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh
    new file mode 100755
    index 00000000..b2f70b59
    --- /dev/null
    +++ b/third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh
    @@ -0,0 +1,49 @@
    +#!/bin/sh
    +set -e
    +
    +# setup
    +mkdir -p src/.hidden src/dir
    +mkdir mid dst
    +
    +echo "file1" > src/file1
    +echo "file2" > src/.file2
    +echo "file3" > src/.hidden/.file3
    +echo "file4" > src/dir/.file4
    +
    +# relative paths
    +zstd -q -r --output-dir-mirror mid/ src/
    +zstd -q -d -r --output-dir-mirror dst/ mid/src/
    +
    +diff --brief --recursive --new-file src/ dst/mid/src/
    +
    +# reset
    +rm -rf mid dst
    +mkdir mid dst
    +
    +# from inside the directory
    +(cd src; zstd -q -r --output-dir-mirror ../mid/ ./)
    +(cd mid; zstd -q -d -r --output-dir-mirror ../dst/ ./)
    +
    +diff --brief --recursive --new-file src/ dst/
    +
    +# reset
    +rm -rf mid dst
    +mkdir mid dst
    +
    +# absolute paths
    +export BASE_PATH="$(pwd)"
    +
    +zstd -q -r --output-dir-mirror mid/ "${BASE_PATH}/src/"
    +zstd -q -d -r --output-dir-mirror  dst/ "${BASE_PATH}/mid/${BASE_PATH}/src/"
    +
    +diff --brief --recursive --new-file src/ "dst/${BASE_PATH}/mid/${BASE_PATH}/src/"
    +
    +# reset
    +rm -rf mid dst
    +mkdir mid dst
    +
    +# dots
    +zstd -q -r --output-dir-mirror mid/ ./src/./
    +zstd -q -d -r --output-dir-mirror  dst/ ./mid/./src/./
    +
    +diff --brief --recursive --new-file src/ dst/mid/src/
    diff --git a/third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh.stderr.exact b/third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh.stderr.exact
    new file mode 100644
    index 00000000..e69de29b
    diff --git a/third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh.stdout.exact b/third-party/zstd/tests/cli-tests/file-handling/directory-mirror.sh.stdout.exact
    new file mode 100644
    index 00000000..e69de29b
    diff --git a/third-party/zstd/tests/datagencli.c b/third-party/zstd/tests/datagencli.c
    index 09ec5e9a..56616bef 100644
    --- a/third-party/zstd/tests/datagencli.c
    +++ b/third-party/zstd/tests/datagencli.c
    @@ -8,122 +8,141 @@
      * You may select, at your option, one of the above-listed licenses.
      */
     
    -
     /*-************************************
    -*  Dependencies
    -**************************************/
    -#include "util.h"      /* Compiler options */
    -#include      /* fprintf, stderr */
    -#include "datagen.h"   /* RDG_generate */
    -
    + *  Dependencies
    + **************************************/
    +#include     /* fprintf, stderr */
    +#include "datagen.h"  /* RDG_generate */
    +#include "loremOut.h" /* LOREM_genOut */
    +#include "util.h"     /* Compiler options */
     
     /*-************************************
    -*  Constants
    -**************************************/
    -#define KB *(1 <<10)
    -#define MB *(1 <<20)
    -#define GB *(1U<<30)
    + *  Constants
    + **************************************/
    +#define KB *(1 << 10)
    +#define MB *(1 << 20)
    +#define GB *(1U << 30)
     
     #define SIZE_DEFAULT ((64 KB) + 1)
     #define SEED_DEFAULT 0
    -#define COMPRESSIBILITY_DEFAULT 50
    -
    +#define COMPRESSIBILITY_DEFAULT 9999
     
     /*-************************************
    -*  Macros
    -**************************************/
    -#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
    -#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
    + *  Macros
    + **************************************/
    +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
    +#define DISPLAYLEVEL(l, ...)  \
    +    if (displayLevel >= l) {  \
    +        DISPLAY(__VA_ARGS__); \
    +    }
     static unsigned displayLevel = 2;
     
    -
     /*-*******************************************************
    -*  Command line
    -*********************************************************/
    + *  Command line
    + *********************************************************/
     static int usage(const char* programName)
     {
    -    DISPLAY( "Compressible data generator\n");
    -    DISPLAY( "Usage :\n");
    -    DISPLAY( "      %s [args]\n", programName);
    -    DISPLAY( "\n");
    -    DISPLAY( "Arguments :\n");
    -    DISPLAY( " -g#    : generate # data (default:%i)\n", SIZE_DEFAULT);
    -    DISPLAY( " -s#    : Select seed (default:%i)\n", SEED_DEFAULT);
    -    DISPLAY( " -P#    : Select compressibility in %% (default:%i%%)\n",
    -                        COMPRESSIBILITY_DEFAULT);
    -    DISPLAY( " -h     : display help and exit\n");
    +    DISPLAY("Compressible data generator\n");
    +    DISPLAY("Usage :\n");
    +    DISPLAY("      %s [args]\n", programName);
    +    DISPLAY("\n");
    +    DISPLAY("Arguments :\n");
    +    DISPLAY(" -g#    : generate # data (default:%i)\n", SIZE_DEFAULT);
    +    DISPLAY(" -s#    : Select seed (default:%i)\n", SEED_DEFAULT);
    +    DISPLAY(" -P#    : Select compressibility in %% (range [0-100])\n");
    +    DISPLAY(" -h     : display help and exit\n");
         return 0;
     }
     
    -
     int main(int argc, const char** argv)
     {
    -    unsigned probaU32 = COMPRESSIBILITY_DEFAULT;
    -    double litProba = 0.0;
    -    U64 size = SIZE_DEFAULT;
    -    U32 seed = SEED_DEFAULT;
    +    unsigned probaU32             = COMPRESSIBILITY_DEFAULT;
    +    double litProba               = 0.0;
    +    U64 size                      = SIZE_DEFAULT;
    +    U32 seed                      = SEED_DEFAULT;
         const char* const programName = argv[0];
     
         int argNb;
    -    for(argNb=1; argNb='0') && (*argument<='9'))
    -                        size *= 10, size += *argument++ - '0';
    -                    if (*argument=='K') { size <<= 10; argument++; }
    -                    if (*argument=='M') { size <<= 20; argument++; }
    -                    if (*argument=='G') { size <<= 30; argument++; }
    -                    if (*argument=='B') { argument++; }
    -                    break;
    -                case 's':
    -                    argument++;
    -                    seed=0;
    -                    while ((*argument>='0') && (*argument<='9'))
    -                        seed *= 10, seed += *argument++ - '0';
    -                    break;
    -                case 'P':
    -                    argument++;
    -                    probaU32 = 0;
    -                    while ((*argument>='0') && (*argument<='9'))
    -                        probaU32 *= 10, probaU32 += *argument++ - '0';
    -                    if (probaU32>100) probaU32 = 100;
    -                    break;
    -                case 'L':   /* hidden argument : Literal distribution probability */
    -                    argument++;
    -                    litProba=0.;
    -                    while ((*argument>='0') && (*argument<='9'))
    -                        litProba *= 10, litProba += *argument++ - '0';
    -                    if (litProba>100.) litProba=100.;
    -                    litProba /= 100.;
    -                    break;
    -                case 'v':
    -                    displayLevel = 4;
    -                    argument++;
    -                    break;
    -                default:
    -                    return usage(programName);
    +            while (*argument != 0) {
    +                switch (*argument) {
    +                    case 'h':
    +                        return usage(programName);
    +                    case 'g':
    +                        argument++;
    +                        size = 0;
    +                        while ((*argument >= '0') && (*argument <= '9'))
    +                            size *= 10, size += (U64)(*argument++ - '0');
    +                        if (*argument == 'K') {
    +                            size <<= 10;
    +                            argument++;
    +                        }
    +                        if (*argument == 'M') {
    +                            size <<= 20;
    +                            argument++;
    +                        }
    +                        if (*argument == 'G') {
    +                            size <<= 30;
    +                            argument++;
    +                        }
    +                        if (*argument == 'B') {
    +                            argument++;
    +                        }
    +                        break;
    +                    case 's':
    +                        argument++;
    +                        seed = 0;
    +                        while ((*argument >= '0') && (*argument <= '9'))
    +                            seed *= 10, seed += (U32)(*argument++ - '0');
    +                        break;
    +                    case 'P':
    +                        argument++;
    +                        probaU32 = 0;
    +                        while ((*argument >= '0') && (*argument <= '9'))
    +                            probaU32 *= 10,
    +                                    probaU32 += (U32)(*argument++ - '0');
    +                        if (probaU32 > 100)
    +                            probaU32 = 100;
    +                        break;
    +                    case 'L': /* hidden argument : Literal distribution
    +                                 probability */
    +                        argument++;
    +                        litProba = 0.;
    +                        while ((*argument >= '0') && (*argument <= '9'))
    +                            litProba *= 10, litProba += *argument++ - '0';
    +                        if (litProba > 100.)
    +                            litProba = 100.;
    +                        litProba /= 100.;
    +                        break;
    +                    case 'v':
    +                        displayLevel = 4;
    +                        argument++;
    +                        break;
    +                    default:
    +                        return usage(programName);
                     }
    -    }   }   }   /* for(argNb=1; argNb (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0);
     
    -        {   BYTE* const dictEnd = info.dictContent + info.dictContentSize;
    +        {   BYTE* const dictEnd = ZSTD_maybeNullPtrAdd(info.dictContent, info.dictContentSize);
                 size_t j;
                 for (j = 0; j < matchLen; j++) {
                     if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) {
    @@ -825,7 +825,7 @@ static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr,
     
         /* Sequences Header */
         if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall);
    -    if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq;
    +    if (nbSeq < 128) *op++ = (BYTE)nbSeq;
         else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
         else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
     
    diff --git a/third-party/zstd/tests/fullbench.c b/third-party/zstd/tests/fullbench.c
    index 41bd26d0..c8f0c0af 100644
    --- a/third-party/zstd/tests/fullbench.c
    +++ b/third-party/zstd/tests/fullbench.c
    @@ -138,18 +138,24 @@ static size_t local_ZSTD_decompress(const void* src, size_t srcSize,
         return ZSTD_decompress(dst, dstSize, buff2, g_cSize);
     }
     
    -static ZSTD_DCtx* g_zdc = NULL;
    +static ZSTD_DCtx* g_zdc = NULL; /* will be initialized within benchMem */
    +static size_t local_ZSTD_decompressDCtx(const void* src, size_t srcSize,
    +                                    void* dst, size_t dstSize,
    +                                    void* buff2)
    +{
    +    (void)src; (void)srcSize;
    +    return ZSTD_decompressDCtx(g_zdc, dst, dstSize, buff2, g_cSize);
    +}
     
     #ifndef ZSTD_DLL_IMPORT
    -typedef enum {
    -    not_streaming = 0,
    -    is_streaming = 1
    -} streaming_operation;
    -extern size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* ctx, const void* src, size_t srcSize, void* dst, size_t dstCapacity, const streaming_operation streaming);
    +
    +extern size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
    +                          const void* src, size_t srcSize,
    +                          void* dst, size_t dstCapacity);
     static size_t local_ZSTD_decodeLiteralsBlock(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
     {
         (void)src; (void)srcSize; (void)dst; (void)dstSize;
    -    return ZSTD_decodeLiteralsBlock(g_zdc, buff2, g_cSize, dst, dstSize, not_streaming);
    +    return ZSTD_decodeLiteralsBlock_wrapper(g_zdc, buff2, g_cSize, dst, dstSize);
     }
     
     static size_t local_ZSTD_decodeSeqHeaders(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
    @@ -453,6 +459,9 @@ static int benchMem(unsigned benchNb,
         case 3:
             benchFunction = local_ZSTD_compress_freshCCtx; benchName = "compress_freshCCtx";
             break;
    +    case 4:
    +        benchFunction = local_ZSTD_decompressDCtx; benchName = "decompressDCtx";
    +        break;
     #ifndef ZSTD_DLL_IMPORT
         case 11:
             benchFunction = local_ZSTD_compressContinue; benchName = "compressContinue";
    @@ -552,6 +561,9 @@ static int benchMem(unsigned benchNb,
         case 3:
             payload = &cparams;
             break;
    +    case 4:
    +        g_cSize = ZSTD_compress(dstBuff2, dstBuffSize, src, srcSize, cLevel);
    +        break;
     #ifndef ZSTD_DLL_IMPORT
         case 11:
             payload = &cparams;
    @@ -606,7 +618,7 @@ static int benchMem(unsigned benchNb,
                 ip += ZSTD_blockHeaderSize;    /* skip block header */
                 ZSTD_decompressBegin(g_zdc);
                 CONTROL(iend > ip);
    -            ip += ZSTD_decodeLiteralsBlock(g_zdc, ip, (size_t)(iend-ip), dstBuff, dstBuffSize, not_streaming);   /* skip literal segment */
    +            ip += ZSTD_decodeLiteralsBlock_wrapper(g_zdc, ip, (size_t)(iend-ip), dstBuff, dstBuffSize);   /* skip literal segment */
                 g_cSize = (size_t)(iend-ip);
                 memcpy(dstBuff2, ip, g_cSize);   /* copy rest of block (it starts by SeqHeader) */
                 srcSize = srcSize > 128 KB ? 128 KB : srcSize;   /* speed relative to block */
    diff --git a/third-party/zstd/tests/fuzz/Makefile b/third-party/zstd/tests/fuzz/Makefile
    index 525e396b..430f6df1 100644
    --- a/third-party/zstd/tests/fuzz/Makefile
    +++ b/third-party/zstd/tests/fuzz/Makefile
    @@ -24,13 +24,12 @@ else
     endif
     CORPORA_URL_PREFIX:=https://github.com/facebook/zstd/releases/download/fuzz-corpora/
     
    -LIBZSTD = ../../lib
    +LIBZSTD_MK_DIR = ../../lib
     DEBUGLEVEL ?= 2
     ZSTD_LEGACY_SUPPORT ?= 1
     
    -include $(LIBZSTD)/libzstd.mk
    +include $(LIBZSTD_MK_DIR)/libzstd.mk
     
    -ZSTDDIR = ../../lib
     PRGDIR = ../../programs
     CONTRIBDIR = ../../contrib
     
    @@ -38,8 +37,8 @@ DEFAULT_SEQ_PROD_DIR = $(CONTRIBDIR)/externalSequenceProducer
     DEFAULT_SEQ_PROD_SRC = $(DEFAULT_SEQ_PROD_DIR)/sequence_producer.c
     THIRD_PARTY_SEQ_PROD_OBJ ?=
     
    -FUZZ_CPPFLAGS := -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(ZSTDDIR)/compress \
    -	-I$(ZSTDDIR)/dictBuilder -I$(ZSTDDIR)/deprecated -I$(ZSTDDIR)/legacy \
    +FUZZ_CPPFLAGS := -I$(LIB_SRCDIR) -I$(LIB_SRCDIR)/common -I$(LIB_SRCDIR)/compress \
    +	-I$(LIB_SRCDIR)/dictBuilder -I$(LIB_SRCDIR)/deprecated -I$(LIB_SRCDIR)/legacy \
     	-I$(CONTRIBDIR)/seekable_format -I$(PRGDIR) -I$(DEFAULT_SEQ_PROD_DIR) \
     	-DZSTD_MULTITHREAD -DZSTD_LEGACY_SUPPORT=1 $(CPPFLAGS)
     FUZZ_EXTRA_FLAGS := -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
    @@ -78,11 +77,11 @@ FUZZ_SRC       := \
     	$(DEFAULT_SEQ_PROD_SRC)
     FUZZ_SRC := $(sort $(wildcard $(FUZZ_SRC)))
     
    -FUZZ_D_OBJ1 := $(subst $(ZSTDDIR)/common/,d_lib_common_,$(FUZZ_SRC))
    -FUZZ_D_OBJ2 := $(subst $(ZSTDDIR)/compress/,d_lib_compress_,$(FUZZ_D_OBJ1))
    -FUZZ_D_OBJ3 := $(subst $(ZSTDDIR)/decompress/,d_lib_decompress_,$(FUZZ_D_OBJ2))
    -FUZZ_D_OBJ4 := $(subst $(ZSTDDIR)/dictBuilder/,d_lib_dictBuilder_,$(FUZZ_D_OBJ3))
    -FUZZ_D_OBJ5 := $(subst $(ZSTDDIR)/legacy/,d_lib_legacy_,$(FUZZ_D_OBJ4))
    +FUZZ_D_OBJ1 := $(subst $(LIB_SRCDIR)/common/,d_lib_common_,$(FUZZ_SRC))
    +FUZZ_D_OBJ2 := $(subst $(LIB_SRCDIR)/compress/,d_lib_compress_,$(FUZZ_D_OBJ1))
    +FUZZ_D_OBJ3 := $(subst $(LIB_SRCDIR)/decompress/,d_lib_decompress_,$(FUZZ_D_OBJ2))
    +FUZZ_D_OBJ4 := $(subst $(LIB_SRCDIR)/dictBuilder/,d_lib_dictBuilder_,$(FUZZ_D_OBJ3))
    +FUZZ_D_OBJ5 := $(subst $(LIB_SRCDIR)/legacy/,d_lib_legacy_,$(FUZZ_D_OBJ4))
     FUZZ_D_OBJ6 := $(subst $(PRGDIR)/,d_prg_,$(FUZZ_D_OBJ5))
     FUZZ_D_OBJ7 := $(subst $(DEFAULT_SEQ_PROD_DIR)/,d_default_seq_prod_,$(FUZZ_D_OBJ6))
     FUZZ_D_OBJ8 := $(subst $\./,d_fuzz_,$(FUZZ_D_OBJ7))
    @@ -90,11 +89,11 @@ FUZZ_D_OBJ9 := $(FUZZ_D_OBJ8:.c=.o)
     FUZZ_D_OBJ10 := $(THIRD_PARTY_SEQ_PROD_OBJ) $(FUZZ_D_OBJ9)
     FUZZ_DECOMPRESS_OBJ := $(FUZZ_D_OBJ10:.S=.o)
     
    -FUZZ_RT_OBJ1 := $(subst $(ZSTDDIR)/common/,rt_lib_common_,$(FUZZ_SRC))
    -FUZZ_RT_OBJ2 := $(subst $(ZSTDDIR)/compress/,rt_lib_compress_,$(FUZZ_RT_OBJ1))
    -FUZZ_RT_OBJ3 := $(subst $(ZSTDDIR)/decompress/,rt_lib_decompress_,$(FUZZ_RT_OBJ2))
    -FUZZ_RT_OBJ4 := $(subst $(ZSTDDIR)/dictBuilder/,rt_lib_dictBuilder_,$(FUZZ_RT_OBJ3))
    -FUZZ_RT_OBJ5 := $(subst $(ZSTDDIR)/legacy/,rt_lib_legacy_,$(FUZZ_RT_OBJ4))
    +FUZZ_RT_OBJ1 := $(subst $(LIB_SRCDIR)/common/,rt_lib_common_,$(FUZZ_SRC))
    +FUZZ_RT_OBJ2 := $(subst $(LIB_SRCDIR)/compress/,rt_lib_compress_,$(FUZZ_RT_OBJ1))
    +FUZZ_RT_OBJ3 := $(subst $(LIB_SRCDIR)/decompress/,rt_lib_decompress_,$(FUZZ_RT_OBJ2))
    +FUZZ_RT_OBJ4 := $(subst $(LIB_SRCDIR)/dictBuilder/,rt_lib_dictBuilder_,$(FUZZ_RT_OBJ3))
    +FUZZ_RT_OBJ5 := $(subst $(LIB_SRCDIR)/legacy/,rt_lib_legacy_,$(FUZZ_RT_OBJ4))
     FUZZ_RT_OBJ6 := $(subst $(PRGDIR)/,rt_prg_,$(FUZZ_RT_OBJ5))
     FUZZ_RT_OBJ7 := $(subst $(DEFAULT_SEQ_PROD_DIR)/,rt_default_seq_prod_,$(FUZZ_RT_OBJ6))
     FUZZ_RT_OBJ8 := $(subst $\./,rt_fuzz_,$(FUZZ_RT_OBJ7))
    @@ -125,26 +124,28 @@ FUZZ_TARGETS :=       \
     	sequence_compression_api \
     	seekable_roundtrip \
     	huf_round_trip \
    -	huf_decompress
    +	huf_decompress \
    +	decompress_cross_format \
    +	generate_sequences
     
     all: libregression.a $(FUZZ_TARGETS)
     
    -rt_lib_common_%.o: $(ZSTDDIR)/common/%.c
    +rt_lib_common_%.o: $(LIB_SRCDIR)/common/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $(FUZZ_ROUND_TRIP_FLAGS) $< -c -o $@
     
    -rt_lib_compress_%.o: $(ZSTDDIR)/compress/%.c
    +rt_lib_compress_%.o: $(LIB_SRCDIR)/compress/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $(FUZZ_ROUND_TRIP_FLAGS) $< -c -o $@
     
    -rt_lib_decompress_%.o: $(ZSTDDIR)/decompress/%.c
    +rt_lib_decompress_%.o: $(LIB_SRCDIR)/decompress/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $(FUZZ_ROUND_TRIP_FLAGS) $< -c -o $@
     
    -rt_lib_decompress_%.o: $(ZSTDDIR)/decompress/%.S
    +rt_lib_decompress_%.o: $(LIB_SRCDIR)/decompress/%.S
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_ASFLAGS) $(FUZZ_ROUND_TRIP_FLAGS) $< -c -o $@
     
    -rt_lib_dictBuilder_%.o: $(ZSTDDIR)/dictBuilder/%.c
    +rt_lib_dictBuilder_%.o: $(LIB_SRCDIR)/dictBuilder/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $(FUZZ_ROUND_TRIP_FLAGS) $< -c -o $@
     
    -rt_lib_legacy_%.o: $(ZSTDDIR)/legacy/%.c
    +rt_lib_legacy_%.o: $(LIB_SRCDIR)/legacy/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $(FUZZ_ROUND_TRIP_FLAGS) $< -c -o $@
     
     rt_prg_%.o: $(PRGDIR)/%.c
    @@ -156,22 +157,22 @@ rt_fuzz_%.o: %.c
     rt_default_seq_prod_%.o: $(DEFAULT_SEQ_PROD_DIR)/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $(FUZZ_ROUND_TRIP_FLAGS) $< -c -o $@
     
    -d_lib_common_%.o: $(ZSTDDIR)/common/%.c
    +d_lib_common_%.o: $(LIB_SRCDIR)/common/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $< -c -o $@
     
    -d_lib_compress_%.o: $(ZSTDDIR)/compress/%.c
    +d_lib_compress_%.o: $(LIB_SRCDIR)/compress/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $< -c -o $@
     
    -d_lib_decompress_%.o: $(ZSTDDIR)/decompress/%.c
    +d_lib_decompress_%.o: $(LIB_SRCDIR)/decompress/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $< -c -o $@
     
    -d_lib_decompress_%.o: $(ZSTDDIR)/decompress/%.S
    +d_lib_decompress_%.o: $(LIB_SRCDIR)/decompress/%.S
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_ASFLAGS) $< -c -o $@
     
    -d_lib_dictBuilder_%.o: $(ZSTDDIR)/dictBuilder/%.c
    +d_lib_dictBuilder_%.o: $(LIB_SRCDIR)/dictBuilder/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $< -c -o $@
     
    -d_lib_legacy_%.o: $(ZSTDDIR)/legacy/%.c
    +d_lib_legacy_%.o: $(LIB_SRCDIR)/legacy/%.c
     	$(CC) $(FUZZ_CPPFLAGS) $(FUZZ_CFLAGS) $< -c -o $@
     
     d_prg_%.o: $(PRGDIR)/%.c
    @@ -240,6 +241,12 @@ huf_round_trip: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_huf_round_trip.o
     huf_decompress: $(FUZZ_HEADERS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o
     	$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_huf_decompress.o $(LIB_FUZZING_ENGINE) -o $@
     
    +decompress_cross_format: $(FUZZ_HEADERS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_decompress_cross_format.o
    +	$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_DECOMPRESS_OBJ) d_fuzz_decompress_cross_format.o $(LIB_FUZZING_ENGINE) -o $@
    +
    +generate_sequences: $(FUZZ_HEADERS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o
    +	$(CXX) $(FUZZ_TARGET_FLAGS) $(FUZZ_ROUND_TRIP_OBJ) rt_fuzz_generate_sequences.o $(LIB_FUZZING_ENGINE) -o $@
    +
     libregression.a: $(FUZZ_HEADERS) $(PRGDIR)/util.h $(PRGDIR)/util.c d_fuzz_regression_driver.o
     	$(AR) $(FUZZ_ARFLAGS) $@ d_fuzz_regression_driver.o
     
    @@ -257,7 +264,7 @@ corpora: $(patsubst %,corpora/%,$(FUZZ_TARGETS))
     seedcorpora: $(patsubst %,corpora/%_seed_corpus.zip,$(FUZZ_TARGETS))
     
     regressiontest: corpora
    -	CC="$(CC)" CXX="$(CXX)" CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" $(PYTHON) ./fuzz.py build all
    +	CC="$(CC)" CXX="$(CXX)" CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" $(PYTHON) ./fuzz.py build all --debug=$(DEBUGLEVEL)
     	$(PYTHON) ./fuzz.py regression all
     
     clean:
    diff --git a/third-party/zstd/tests/fuzz/README.md b/third-party/zstd/tests/fuzz/README.md
    index 2a9bd457..e2196e83 100644
    --- a/third-party/zstd/tests/fuzz/README.md
    +++ b/third-party/zstd/tests/fuzz/README.md
    @@ -117,3 +117,45 @@ CC=clang CXX=clang++ ./fuzz.py build all --enable-msan
     ## Fuzzing a custom sequence producer plugin
     Sequence producer plugin authors can use the zstd fuzzers to stress-test their code.
     See the documentation in `fuzz_third_party_seq_prod.h` for details.
    +
    +## Adding a new fuzzer
    +There are several steps involved in adding a new fuzzer harness.
    +
    +### Build your harness
    +1. Create a new your fuzzer harness `tests/fuzz/your_harness.c`.
    +
    +2. Add your harness to the Makefile
    +
    +    2.1 Follow [this example](https://github.com/facebook/zstd/blob/e124e39301381de8f323436a3e4c46539747ba24/tests/fuzz/Makefile#L216) if your fuzzer requires both compression and decompression symbols (prefix `rt_`). If your fuzzer only requires decompression symbols, follow [this example](https://github.com/facebook/zstd/blob/6a0052a409e2604bd40354b76b86272b712edd7d/tests/fuzz/Makefile#L194) (prefix `d_`).
    +    
    +    2.2 Add your target to [`FUZZ_TARGETS`](https://github.com/facebook/zstd/blob/6a0052a409e2604bd40354b76b86272b712edd7d/tests/fuzz/Makefile#L108).
    +    
    +3. Add your harness to [`fuzz.py`](https://github.com/facebook/zstd/blob/6a0052a409e2604bd40354b76b86272b712edd7d/tests/fuzz/fuzz.py#L48).
    +
    +### Generate seed data
    +Follow the instructions above to generate seed data:
    +```
    +make -C ../tests decodecorpus
    +./fuzz.py gen your_harness
    +```
    +
    +### Run the harness
    +Follow the instructions above to run your harness and fix any crashes:
    +```
    +./fuzz.py build your_harness --enable-fuzzer --enable-asan --enable-ubsan --cc clang --cxx clang++
    +./fuzz.py libfuzzer your_harness
    +```
    +
    +### Minimize and zip the corpus
    +After running the fuzzer for a while, you will have a large corpus at `tests/fuzz/corpora/your_harness*`.
    +This corpus must be minimized and zipped before uploading to GitHub for regression testing:
    +```
    +./fuzz.py minimize your_harness
    +./fuzz.py zip your_harness 
    +```
    +
    +### Upload the zip file to GitHub
    +The previous step should produce a `.zip` file containing the corpus for your new harness.
    +This corpus must be uploaded to GitHub here: https://github.com/facebook/zstd/releases/tag/fuzz-corpora
    +
    +
    diff --git a/third-party/zstd/tests/fuzz/decompress_cross_format.c b/third-party/zstd/tests/fuzz/decompress_cross_format.c
    new file mode 100644
    index 00000000..da10702a
    --- /dev/null
    +++ b/third-party/zstd/tests/fuzz/decompress_cross_format.c
    @@ -0,0 +1,130 @@
    +/*
    + * Copyright (c) Meta Platforms, Inc. and affiliates.
    + * All rights reserved.
    + *
    + * This source code is licensed under both the BSD-style license (found in the
    + * LICENSE file in the root directory of this source tree) and the GPLv2 (found
    + * in the COPYING file in the root directory of this source tree).
    + * You may select, at your option, one of the above-listed licenses.
    + */
    +
    +// This fuzz target validates decompression of magicless-format compressed data.
    +
    +#include 
    +#include 
    +#include 
    +#include 
    +#include "fuzz_helpers.h"
    +#define ZSTD_STATIC_LINKING_ONLY
    +#include "zstd.h"
    +#include "fuzz_data_producer.h"
    +
    +static ZSTD_DCtx *dctx = NULL;
    +
    +int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
    +{
    +    // Give a random portion of src data to the producer, to use for parameter generation.
    +    // The rest will be interpreted as magicless compressed data.
    +    FUZZ_dataProducer_t *producer = FUZZ_dataProducer_create(src, size);
    +    size_t magiclessSize = FUZZ_dataProducer_reserveDataPrefix(producer);
    +    const uint8_t* const magiclessSrc = src;
    +    size_t const dstSize = FUZZ_dataProducer_uint32Range(producer, 0, 10 * size);
    +    uint8_t* const standardDst = (uint8_t*)FUZZ_malloc(dstSize);
    +    uint8_t* const magiclessDst = (uint8_t*)FUZZ_malloc(dstSize);
    +
    +    // Create standard-format src from magicless-format src
    +    const uint32_t zstd_magic = ZSTD_MAGICNUMBER;
    +    size_t standardSize = sizeof(zstd_magic) + magiclessSize;
    +    uint8_t* const standardSrc = (uint8_t*)FUZZ_malloc(standardSize);
    +    memcpy(standardSrc, &zstd_magic, sizeof(zstd_magic)); // assume fuzzing on little-endian machine
    +    memcpy(standardSrc + sizeof(zstd_magic), magiclessSrc, magiclessSize);
    +
    +    // Truncate to a single frame
    +    {
    +        const size_t standardFrameCompressedSize = ZSTD_findFrameCompressedSize(standardSrc, standardSize);
    +        if (ZSTD_isError(standardFrameCompressedSize)) {
    +            goto cleanup_and_return;
    +        }
    +        standardSize = standardFrameCompressedSize;
    +        magiclessSize = standardFrameCompressedSize - sizeof(zstd_magic);
    +    }
    +
    +    // Create DCtx if needed
    +    if (!dctx) {
    +        dctx = ZSTD_createDCtx();
    +        FUZZ_ASSERT(dctx);
    +    }
    +
    +    // Test one-shot decompression
    +    {
    +        FUZZ_ZASSERT(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters));
    +        FUZZ_ZASSERT(ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1));
    +        const size_t standardRet = ZSTD_decompressDCtx(
    +                                        dctx, standardDst, dstSize, standardSrc, standardSize);
    +
    +        FUZZ_ZASSERT(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters));
    +        FUZZ_ZASSERT(ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless));
    +        const size_t magiclessRet = ZSTD_decompressDCtx(
    +                                        dctx, magiclessDst, dstSize, magiclessSrc, magiclessSize);
    +
    +        // Standard accepts => magicless should accept
    +        if (!ZSTD_isError(standardRet)) FUZZ_ZASSERT(magiclessRet);
    +
    +        // Magicless accepts => standard should accept
    +        // NOTE: this is nice-to-have, please disable this check if it is difficult to satisfy.
    +        if (!ZSTD_isError(magiclessRet)) FUZZ_ZASSERT(standardRet);
    +
    +        // If both accept, decompressed size and data should match
    +        if (!ZSTD_isError(standardRet) && !ZSTD_isError(magiclessRet)) {
    +            FUZZ_ASSERT(standardRet == magiclessRet);
    +            if (standardRet > 0) {
    +                FUZZ_ASSERT(
    +                    memcmp(standardDst, magiclessDst, standardRet) == 0
    +                );
    +            }
    +        }
    +    }
    +
    +    // Test streaming decompression
    +    {
    +        ZSTD_inBuffer standardIn = { standardSrc, standardSize, 0 };
    +        ZSTD_inBuffer magiclessIn = { magiclessSrc, magiclessSize, 0 };
    +        ZSTD_outBuffer standardOut = { standardDst, dstSize, 0 };
    +        ZSTD_outBuffer magiclessOut = { magiclessDst, dstSize, 0 };
    +
    +        FUZZ_ZASSERT(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters));
    +        FUZZ_ZASSERT(ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1));
    +        const size_t standardRet = ZSTD_decompressStream(dctx, &standardOut, &standardIn);
    +
    +        FUZZ_ZASSERT(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters));
    +        FUZZ_ZASSERT(ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless));
    +        const size_t magiclessRet = ZSTD_decompressStream(dctx, &magiclessOut, &magiclessIn);
    +
    +        // Standard accepts => magicless should accept
    +        if (standardRet == 0) FUZZ_ASSERT(magiclessRet == 0);
    +
    +        // Magicless accepts => standard should accept
    +        // NOTE: this is nice-to-have, please disable this check if it is difficult to satisfy.
    +        if (magiclessRet == 0) FUZZ_ASSERT(standardRet == 0);
    +
    +        // If both accept, decompressed size and data should match
    +        if (standardRet == 0 && magiclessRet == 0) {
    +            FUZZ_ASSERT(standardOut.pos == magiclessOut.pos);
    +            if (standardOut.pos > 0) {
    +                FUZZ_ASSERT(
    +                    memcmp(standardOut.dst, magiclessOut.dst, standardOut.pos) == 0
    +                );
    +            }
    +        }
    +    }
    +
    +cleanup_and_return:
    +#ifndef STATEFUL_FUZZING
    +    ZSTD_freeDCtx(dctx); dctx = NULL;
    +#endif
    +    free(standardSrc);
    +    free(standardDst);
    +    free(magiclessDst);
    +    FUZZ_dataProducer_free(producer);
    +    return 0;
    +}
    diff --git a/third-party/zstd/tests/fuzz/dictionary_round_trip.c b/third-party/zstd/tests/fuzz/dictionary_round_trip.c
    index 06fdf24e..0470fbf5 100644
    --- a/third-party/zstd/tests/fuzz/dictionary_round_trip.c
    +++ b/third-party/zstd/tests/fuzz/dictionary_round_trip.c
    @@ -23,13 +23,13 @@
     #include "fuzz_data_producer.h"
     #include "fuzz_third_party_seq_prod.h"
     
    -static ZSTD_CCtx *cctx = NULL;
    -static ZSTD_DCtx *dctx = NULL;
    +static ZSTD_CCtx* cctx = NULL;
    +static ZSTD_DCtx* dctx = NULL;
     
    -static size_t roundTripTest(void *result, size_t resultCapacity,
    -                            void *compressed, size_t compressedCapacity,
    -                            const void *src, size_t srcSize,
    -                            FUZZ_dataProducer_t *producer)
    +static size_t roundTripTest(void* result, size_t resultCapacity,
    +                            void* compressed, size_t compressedCapacity,
    +                            const void* src, size_t srcSize,
    +                            FUZZ_dataProducer_t* producer)
     {
         ZSTD_dictContentType_e dictContentType = ZSTD_dct_auto;
         FUZZ_dict_t dict = FUZZ_train(src, srcSize, producer);
    diff --git a/third-party/zstd/tests/fuzz/fuzz.py b/third-party/zstd/tests/fuzz/fuzz.py
    index 8e0a9eaa..d59df926 100755
    --- a/third-party/zstd/tests/fuzz/fuzz.py
    +++ b/third-party/zstd/tests/fuzz/fuzz.py
    @@ -65,6 +65,8 @@ def __init__(self, input_type, frame_type=FrameType.ZSTD):
         'seekable_roundtrip': TargetInfo(InputType.RAW_DATA),
         'huf_round_trip': TargetInfo(InputType.RAW_DATA),
         'huf_decompress': TargetInfo(InputType.RAW_DATA),
    +    'decompress_cross_format': TargetInfo(InputType.RAW_DATA),
    +    'generate_sequences': TargetInfo(InputType.RAW_DATA),
     }
     TARGETS = list(TARGET_INFO.keys())
     ALL_TARGETS = TARGETS + ['all']
    @@ -250,10 +252,10 @@ def build_parser(args):
             action='store_true',
             help='Enable UBSAN')
         parser.add_argument(
    -        '--enable-ubsan-pointer-overflow',
    +        '--disable-ubsan-pointer-overflow',
             dest='ubsan_pointer_overflow',
    -        action='store_true',
    -        help='Enable UBSAN pointer overflow check (known failure)')
    +        action='store_false',
    +        help='Disable UBSAN pointer overflow check (known failure)')
         parser.add_argument(
             '--enable-msan', dest='msan', action='store_true', help='Enable MSAN')
         parser.add_argument(
    @@ -383,8 +385,6 @@ def build_parser(args):
             raise RuntimeError('MSAN may not be used with any other sanitizers')
         if args.msan_track_origins and not args.msan:
             raise RuntimeError('--enable-msan-track-origins requires MSAN')
    -    if args.ubsan_pointer_overflow and not args.ubsan:
    -        raise RuntimeError('--enable-ubsan-pointer-overflow requires UBSAN')
         if args.sanitize_recover and not args.sanitize:
             raise RuntimeError('--enable-sanitize-recover but no sanitizers used')
     
    @@ -407,7 +407,12 @@ def build(args):
         cxxflags = shlex.split(args.cxxflags)
         mflags = shlex.split(args.mflags)
         # Flags to be added to both cflags and cxxflags
    -    common_flags = []
    +    common_flags = [
    +        '-Werror',
    +        '-Wno-error=declaration-after-statement',
    +        '-Wno-error=c++-compat',
    +        '-Wno-error=deprecated' # C files are sometimes compiled with CXX
    +    ]
     
         cppflags += [
             '-DDEBUGLEVEL={}'.format(args.debug),
    @@ -494,6 +499,7 @@ def build(args):
         subprocess.check_call(clean_cmd)
         build_cmd = [
             'make',
    +        '-j',
             cc_str,
             cxx_str,
             cppflags_str,
    diff --git a/third-party/zstd/tests/fuzz/fuzz_data_producer.c b/third-party/zstd/tests/fuzz/fuzz_data_producer.c
    index bf846b68..056de3ee 100644
    --- a/third-party/zstd/tests/fuzz/fuzz_data_producer.c
    +++ b/third-party/zstd/tests/fuzz/fuzz_data_producer.c
    @@ -28,12 +28,12 @@ void FUZZ_dataProducer_free(FUZZ_dataProducer_t *producer) { free(producer); }
     
     uint32_t FUZZ_dataProducer_uint32Range(FUZZ_dataProducer_t *producer, uint32_t min,
                                       uint32_t max) {
    -    FUZZ_ASSERT(min <= max);
    -
         uint32_t range = max - min;
         uint32_t rolling = range;
         uint32_t result = 0;
     
    +    FUZZ_ASSERT(min <= max);
    +
         while (rolling > 0 && producer->size > 0) {
           uint8_t next = *(producer->data + producer->size - 1);
           producer->size -= 1;
    @@ -79,11 +79,11 @@ int FUZZ_dataProducer_empty(FUZZ_dataProducer_t *producer) {
     
     size_t FUZZ_dataProducer_contract(FUZZ_dataProducer_t *producer, size_t newSize)
     {
    -    newSize = newSize > producer->size ? producer->size : newSize;
    +    const size_t effectiveNewSize = newSize > producer->size ? producer->size : newSize;
     
    -    size_t remaining = producer->size - newSize;
    +    size_t remaining = producer->size - effectiveNewSize;
         producer->data = producer->data + remaining;
    -    producer->size = newSize;
    +    producer->size = effectiveNewSize;
         return remaining;
     }
     
    diff --git a/third-party/zstd/tests/fuzz/fuzz_third_party_seq_prod.h b/third-party/zstd/tests/fuzz/fuzz_third_party_seq_prod.h
    index f04ad31a..f0771e47 100644
    --- a/third-party/zstd/tests/fuzz/fuzz_third_party_seq_prod.h
    +++ b/third-party/zstd/tests/fuzz/fuzz_third_party_seq_prod.h
    @@ -52,7 +52,7 @@ extern "C" {
     size_t FUZZ_seqProdSetup(void);
     
     /* The fuzzer will call this function after each test-case. It should free
    - * resources aquired by FUZZ_seqProdSetup() to prevent leaks across test-cases.
    + * resources acquired by FUZZ_seqProdSetup() to prevent leaks across test-cases.
      *
      * The fuzzer will assert() that the return value is zero. To signal an error,
      * please return a non-zero value. */
    @@ -72,7 +72,7 @@ size_t FUZZ_seqProdTearDown(void);
     void* FUZZ_createSeqProdState(void);
     
     /* The fuzzer will call this function after each test-case. It should free any
    - * resources aquired by FUZZ_createSeqProdState().
    + * resources acquired by FUZZ_createSeqProdState().
      *
      * The fuzzer will assert() that the return value is zero. To signal an error,
      * please return a non-zero value. */
    diff --git a/third-party/zstd/tests/fuzz/generate_sequences.c b/third-party/zstd/tests/fuzz/generate_sequences.c
    new file mode 100644
    index 00000000..1cc57e84
    --- /dev/null
    +++ b/third-party/zstd/tests/fuzz/generate_sequences.c
    @@ -0,0 +1,88 @@
    +/*
    + * Copyright (c) Meta Platforms, Inc. and affiliates.
    + * All rights reserved.
    + *
    + * This source code is licensed under both the BSD-style license (found in the
    + * LICENSE file in the root directory of this source tree) and the GPLv2 (found
    + * in the COPYING file in the root directory of this source tree).
    + * You may select, at your option, one of the above-listed licenses.
    + */
    +
    +#define ZSTD_STATIC_LINKING_ONLY
    +
    +#include 
    +#include 
    +#include 
    +#include 
    +
    +#include "fuzz_data_producer.h"
    +#include "fuzz_helpers.h"
    +#include "zstd_helpers.h"
    +
    +/**
    + * This fuzz target ensures that ZSTD_generateSequences() does not crash and
    + * if it succeeds that ZSTD_compressSequences() round trips.
    + */
    +
    +static void testRoundTrip(ZSTD_CCtx* cctx, ZSTD_Sequence const* seqs, size_t nbSeqs, const void* src, size_t srcSize) {
    +  /* Compress the sequences with block delimiters */
    +  const size_t compressBound = ZSTD_compressBound(srcSize);
    +  void* dst = FUZZ_malloc(compressBound);
    +  FUZZ_ASSERT(dst);
    +
    +  size_t compressedSize = ZSTD_compressSequences(cctx, dst, compressBound, seqs, nbSeqs, src, srcSize);
    +  FUZZ_ZASSERT(compressedSize);
    +
    +  void* decompressed = FUZZ_malloc(srcSize);
    +  FUZZ_ASSERT(srcSize == 0 || decompressed);
    +  size_t decompressedSize = ZSTD_decompress(decompressed, srcSize, dst, compressedSize);
    +  FUZZ_ZASSERT(decompressedSize);
    +  FUZZ_ASSERT(decompressedSize == srcSize);
    +  if (srcSize != 0) {
    +    FUZZ_ASSERT(!memcmp(src, decompressed, srcSize));
    +  }
    +
    +  free(decompressed);
    +  free(dst);
    +}
    +
    +int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
    +
    +  FUZZ_dataProducer_t *producer = FUZZ_dataProducer_create(data, size);
    +  size = FUZZ_dataProducer_reserveDataPrefix(producer);
    +
    +  ZSTD_CCtx* cctx = ZSTD_createCCtx();
    +  FUZZ_ASSERT(cctx);
    +
    +  const size_t seqsCapacity = FUZZ_dataProducer_uint32Range(producer, 0, 2 * ZSTD_sequenceBound(size));
    +  ZSTD_Sequence* seqs = (ZSTD_Sequence*)FUZZ_malloc(sizeof(ZSTD_Sequence) * seqsCapacity);
    +  FUZZ_ASSERT(seqsCapacity == 0 || seqs);
    +
    +  FUZZ_setRandomParameters(cctx, size, producer);
    +  FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 0));
    +  FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 0));
    +
    +  const size_t nbSeqs = ZSTD_generateSequences(cctx, seqs, seqsCapacity, data, size);
    +  if (ZSTD_isError(nbSeqs)) {
    +    /* Allowed to error if the destination is too small */
    +    if (ZSTD_getErrorCode(nbSeqs) == ZSTD_error_dstSize_tooSmall) {
    +        FUZZ_ASSERT(seqsCapacity < ZSTD_sequenceBound(size));
    +    }
    +  } else {
    +    /* Ensure we round trip with and without block delimiters*/
    +
    +    FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters));
    +    testRoundTrip(cctx, seqs, nbSeqs, data, size);
    +
    +    const size_t nbMergedSeqs = ZSTD_mergeBlockDelimiters(seqs, nbSeqs);
    +    FUZZ_ASSERT(nbMergedSeqs <= nbSeqs);
    +    FUZZ_ZASSERT(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only));
    +    FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_noBlockDelimiters));
    +    testRoundTrip(cctx, seqs, nbMergedSeqs, data, size);
    +  }
    +
    +  free(seqs);
    +  ZSTD_freeCCtx(cctx);
    +  FUZZ_dataProducer_free(producer);
    +  return 0;
    +}
    diff --git a/third-party/zstd/tests/fuzz/regression_driver.c b/third-party/zstd/tests/fuzz/regression_driver.c
    index 550c65d8..26e2b6af 100644
    --- a/third-party/zstd/tests/fuzz/regression_driver.c
    +++ b/third-party/zstd/tests/fuzz/regression_driver.c
    @@ -44,11 +44,12 @@ int main(int argc, char const **argv) {
         fprintf(stderr, "WARNING: No files passed to %s\n", argv[0]);
       for (i = 0; i < files->tableSize; ++i) {
         char const *fileName = files->fileNames[i];
    -    DEBUGLOG(3, "Running %s", fileName);
         size_t const fileSize = UTIL_getFileSize(fileName);
         size_t readSize;
         FILE *file;
     
    +    DEBUGLOG(3, "Running %s", fileName);
    +
         /* Check that it is a regular file, and that the fileSize is valid.
          * If it is not a regular file, then it may have been deleted since we
          * constructed the list, so just skip it, but return an error exit code.
    diff --git a/third-party/zstd/tests/fuzz/sequence_compression_api.c b/third-party/zstd/tests/fuzz/sequence_compression_api.c
    index ede7080e..ec0106c1 100644
    --- a/third-party/zstd/tests/fuzz/sequence_compression_api.c
    +++ b/third-party/zstd/tests/fuzz/sequence_compression_api.c
    @@ -116,7 +116,7 @@ static size_t decodeSequences(void* dst, size_t nbSequences,
                     }
                 }
                 for (; j < matchLength; ++j) {
    -                op[j] = op[j - generatedSequences[i].offset];
    +                op[j] = op[(ptrdiff_t)(j - generatedSequences[i].offset)];
                 }
                 op += j;
                 FUZZ_ASSERT(generatedSequences[i].matchLength == j + k);
    diff --git a/third-party/zstd/tests/fuzz/simple_decompress.c b/third-party/zstd/tests/fuzz/simple_decompress.c
    index ce5f9f09..0dc9e5b7 100644
    --- a/third-party/zstd/tests/fuzz/simple_decompress.c
    +++ b/third-party/zstd/tests/fuzz/simple_decompress.c
    @@ -16,6 +16,9 @@
     #include 
     #include 
     #include 
    +
    +#define ZSTD_STATIC_LINKING_ONLY
    +
     #include "fuzz_helpers.h"
     #include "zstd.h"
     #include "fuzz_data_producer.h"
    @@ -34,11 +37,18 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
             FUZZ_ASSERT(dctx);
         }
     
    -    size_t const bufSize = FUZZ_dataProducer_uint32Range(producer, 0, 10 * size);
    -    void *rBuf = FUZZ_malloc(bufSize);
    -
    -    ZSTD_decompressDCtx(dctx, rBuf, bufSize, src, size);
    -    free(rBuf);
    +    {
    +        size_t const bufSize = FUZZ_dataProducer_uint32Range(producer, 0, 10 * size);
    +        void *rBuf = FUZZ_malloc(bufSize);
    +        size_t const dSize = ZSTD_decompressDCtx(dctx, rBuf, bufSize, src, size);
    +        if (!ZSTD_isError(dSize)) {
    +            /* If decompression was successful, the content size from the frame header(s) should be valid. */
    +            unsigned long long const expectedSize = ZSTD_findDecompressedSize(src, size);
    +            FUZZ_ASSERT(expectedSize != ZSTD_CONTENTSIZE_ERROR);
    +            FUZZ_ASSERT(expectedSize == ZSTD_CONTENTSIZE_UNKNOWN || expectedSize == dSize);
    +        }
    +        free(rBuf);
    +    }
     
         FUZZ_dataProducer_free(producer);
     
    diff --git a/third-party/zstd/tests/fuzz/simple_round_trip.c b/third-party/zstd/tests/fuzz/simple_round_trip.c
    index 8b123197..660092e6 100644
    --- a/third-party/zstd/tests/fuzz/simple_round_trip.c
    +++ b/third-party/zstd/tests/fuzz/simple_round_trip.c
    @@ -27,7 +27,7 @@
     static ZSTD_CCtx *cctx = NULL;
     static ZSTD_DCtx *dctx = NULL;
     
    -static size_t getDecompressionMargin(void const* compressed, size_t cSize, size_t srcSize, int hasSmallBlocks)
    +static size_t getDecompressionMargin(void const* compressed, size_t cSize, size_t srcSize, int hasSmallBlocks, int maxBlockSize)
     {
         size_t margin = ZSTD_decompressionMargin(compressed, cSize);
         if (!hasSmallBlocks) {
    @@ -37,7 +37,12 @@ static size_t getDecompressionMargin(void const* compressed, size_t cSize, size_
             ZSTD_frameHeader zfh;
             size_t marginM;
             FUZZ_ZASSERT(ZSTD_getFrameHeader(&zfh, compressed, cSize));
    -        marginM = ZSTD_DECOMPRESSION_MARGIN(srcSize, zfh.blockSizeMax);
    +        if (maxBlockSize == 0) {
    +            maxBlockSize = zfh.blockSizeMax;
    +        } else {
    +            maxBlockSize = MIN(maxBlockSize, (int)zfh.blockSizeMax);
    +        }
    +        marginM = ZSTD_DECOMPRESSION_MARGIN(srcSize, maxBlockSize);
             if (marginM < margin)
                 margin = marginM;
         }
    @@ -52,12 +57,14 @@ static size_t roundTripTest(void *result, size_t resultCapacity,
         size_t cSize;
         size_t dSize;
         int targetCBlockSize = 0;
    +    int maxBlockSize = 0;
         if (FUZZ_dataProducer_uint32Range(producer, 0, 1)) {
             size_t const remainingBytes = FUZZ_dataProducer_remainingBytes(producer);
             FUZZ_setRandomParameters(cctx, srcSize, producer);
             cSize = ZSTD_compress2(cctx, compressed, compressedCapacity, src, srcSize);
             FUZZ_ZASSERT(cSize);
             FUZZ_ZASSERT(ZSTD_CCtx_getParameter(cctx, ZSTD_c_targetCBlockSize, &targetCBlockSize));
    +        FUZZ_ZASSERT(ZSTD_CCtx_getParameter(cctx, ZSTD_c_maxBlockSize, &maxBlockSize));
             // Compress a second time and check for determinism
             {
                 size_t const cSize0 = cSize;
    @@ -83,13 +90,16 @@ static size_t roundTripTest(void *result, size_t resultCapacity,
                 FUZZ_ASSERT(XXH64(compressed, cSize, 0) == hash0);
             }
         }
    +    if (FUZZ_dataProducer_uint32Range(producer, 0, 1)) {
    +        FUZZ_ZASSERT(ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, maxBlockSize));
    +    }
         dSize = ZSTD_decompressDCtx(dctx, result, resultCapacity, compressed, cSize);
         FUZZ_ZASSERT(dSize);
         FUZZ_ASSERT_MSG(dSize == srcSize, "Incorrect regenerated size");
         FUZZ_ASSERT_MSG(!FUZZ_memcmp(src, result, dSize), "Corruption!");
     
         {
    -        size_t margin = getDecompressionMargin(compressed, cSize, srcSize, targetCBlockSize);
    +        size_t margin = getDecompressionMargin(compressed, cSize, srcSize, targetCBlockSize, maxBlockSize);
             size_t const outputSize = srcSize + margin;
             char* const output = (char*)FUZZ_malloc(outputSize);
             char* const input = output + outputSize - cSize;
    diff --git a/third-party/zstd/tests/fuzz/stream_round_trip.c b/third-party/zstd/tests/fuzz/stream_round_trip.c
    index 7d277a85..6e340c81 100644
    --- a/third-party/zstd/tests/fuzz/stream_round_trip.c
    +++ b/third-party/zstd/tests/fuzz/stream_round_trip.c
    @@ -63,6 +63,8 @@ static size_t compress(uint8_t *dst, size_t capacity,
         size_t dstSize = 0;
         ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
         FUZZ_setRandomParameters(cctx, srcSize, producer);
    +    int maxBlockSize;
    +    FUZZ_ZASSERT(ZSTD_CCtx_getParameter(cctx, ZSTD_c_maxBlockSize, &maxBlockSize));
     
         while (srcSize > 0) {
             ZSTD_inBuffer in = makeInBuffer(&src, &srcSize, producer);
    @@ -93,6 +95,8 @@ static size_t compress(uint8_t *dst, size_t capacity,
                             if (FUZZ_dataProducer_uint32Range(producer, 0, 7) == 0) {
                                 size_t const remaining = in.size - in.pos;
                                 FUZZ_setRandomParameters(cctx, remaining, producer);
    +                            /* Always use the same maxBlockSize */
    +                            FUZZ_ZASSERT(ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, maxBlockSize));
                             }
                             mode = -1;
                         }
    @@ -132,6 +136,23 @@ static size_t compress(uint8_t *dst, size_t capacity,
         return dstSize;
     }
     
    +static size_t decompress(void* dst, size_t dstCapacity, void const* src, size_t srcSize, FUZZ_dataProducer_t* producer)
    +{
    +    ZSTD_inBuffer in = {src, srcSize, 0};
    +    ZSTD_outBuffer out = {dst, dstCapacity, 0};
    +    int maxBlockSize;
    +    FUZZ_ZASSERT(ZSTD_CCtx_getParameter(cctx, ZSTD_c_maxBlockSize, &maxBlockSize));
    +    if (FUZZ_dataProducer_uint32Range(producer, 0, 1)) {
    +        FUZZ_ZASSERT(ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, maxBlockSize));
    +    }
    +    while (in.pos < in.size) {
    +        size_t const ret = ZSTD_decompressStream(dctx, &out, &in);
    +        FUZZ_ZASSERT(ret);
    +        FUZZ_ASSERT(ret == 0);
    +    }
    +    return out.pos;
    +}
    +
     int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
     {
         FUZZ_SEQ_PROD_SETUP();
    @@ -163,8 +184,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
     
         {
             size_t const cSize = compress(cBuf, neededBufSize, src, size, producer);
    -        size_t const rSize =
    -            ZSTD_decompressDCtx(dctx, rBuf, neededBufSize, cBuf, cSize);
    +        size_t const rSize = decompress(rBuf, neededBufSize, cBuf, cSize, producer);
             FUZZ_ZASSERT(rSize);
             FUZZ_ASSERT_MSG(rSize == size, "Incorrect regenerated size");
             FUZZ_ASSERT_MSG(!FUZZ_memcmp(src, rBuf, size), "Corruption!");
    diff --git a/third-party/zstd/tests/fuzzer.c b/third-party/zstd/tests/fuzzer.c
    index 07ddfefd..f7bdae90 100644
    --- a/third-party/zstd/tests/fuzzer.c
    +++ b/third-party/zstd/tests/fuzzer.c
    @@ -328,7 +328,7 @@ static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize,
     
             if (seqs[i].offset != 0) {
                 for (j = 0; j < seqs[i].matchLength; ++j)
    -                dst[j] = dst[j - seqs[i].offset];
    +                dst[j] = dst[(ptrdiff_t)(j - seqs[i].offset)];
                 dst += seqs[i].matchLength;
                 src += seqs[i].matchLength;
                 size -= seqs[i].matchLength;
    @@ -376,7 +376,7 @@ static int threadPoolTests(void) {
     
         RDG_genBuffer(CNBuffer, CNBuffSize, 0.5, 0.5, 0);
     
    -    DISPLAYLEVEL(3, "thread pool test : threadPool re-use roundtrips: ");
    +    DISPLAYLEVEL(3, "thread pool test : threadPool reuse roundtrips: ");
         {
             ZSTD_CCtx* cctx = ZSTD_createCCtx();
             ZSTD_threadPool* pool = ZSTD_createThreadPool(kPoolNumThreads);
    @@ -531,7 +531,7 @@ static void test_decompressBound(unsigned tnb)
                 CHECK_EQ( ZSTD_flushStream(cctx, &out), 0 );
             }
             CHECK_EQ( ZSTD_endStream(cctx, &out), 0 );
    -        CHECK( ZSTD_decompressBound(outBuffer, out.pos) > 0x100000000LLU /* 4 GB */ );
    +        CHECK( ZSTD_decompressBound(outBuffer, out.pos) > 0x100000000ULL /* 4 GB */ );
             ZSTD_freeCCtx(cctx);
             free(outBuffer);
         }
    @@ -953,6 +953,25 @@ static int basicUnitTests(U32 const seed, double compressibility)
             ZSTD_freeCCtx(cctx);
         }
     
    +    DISPLAYLEVEL(3, "test%3i : maxBlockSize = 2K", testNb++);
    +    {
    +        ZSTD_CCtx* cctx = ZSTD_createCCtx();
    +        ZSTD_DCtx* dctx = ZSTD_createDCtx();
    +        CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1));
    +        CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 2048));
    +        CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, 2048));
    +
    +        cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize);
    +        CHECK_Z(cSize);
    +        CHECK_Z(ZSTD_decompressDCtx(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize));
    +
    +        CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, 1024));
    +        CHECK(ZSTD_isError(ZSTD_decompressDCtx(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize)));
    +
    +        ZSTD_freeDCtx(dctx);
    +        ZSTD_freeCCtx(cctx);
    +    }
    +
         DISPLAYLEVEL(3, "test%3i : ldm fill dict out-of-bounds check", testNb++);
         {
             ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    @@ -1100,6 +1119,9 @@ static int basicUnitTests(U32 const seed, double compressibility)
             size_t const srcSize1 = kWindowSize / 2;
             size_t const srcSize2 = kWindowSize * 10;
     
    +        CHECK(cctx!=NULL);
    +        CHECK(dctx!=NULL);
    +        CHECK(dict!=NULL);
             if (CNBuffSize < dictSize) goto _output_error;
     
             RDG_genBuffer(dict, dictSize, 0.5, 0.5, seed);
    @@ -1121,6 +1143,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
             cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, srcSize1);
             CHECK_Z(cSize);
             CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize, dict, dictSize));
    +
             cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, srcSize2);
             /* Streaming decompression to catch out of bounds offsets. */
             {
    @@ -1134,24 +1157,22 @@ static int basicUnitTests(U32 const seed, double compressibility)
             CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 2));
             /* Round trip once with a dictionary. */
             CHECK_Z(ZSTD_CCtx_refPrefix(cctx, dict, dictSize));
    -        {
    -            ZSTD_inBuffer in = {CNBuffer, srcSize1, 0};
    +        {   ZSTD_inBuffer in = {CNBuffer, srcSize1, 0};
                 ZSTD_outBuffer out = {compressedBuffer, compressedBufferSize, 0};
                 CHECK_Z(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_flush));
                 CHECK_Z(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end));
                 cSize = out.pos;
             }
             CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize, dict, dictSize));
    -        {
    -            ZSTD_inBuffer in = {CNBuffer, srcSize2, 0};
    +
    +        {   ZSTD_inBuffer in = {CNBuffer, srcSize2, 0};
                 ZSTD_outBuffer out = {compressedBuffer, compressedBufferSize, 0};
                 CHECK_Z(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_flush));
                 CHECK_Z(ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end));
                 cSize = out.pos;
             }
             /* Streaming decompression to catch out of bounds offsets. */
    -        {
    -            ZSTD_inBuffer in = {compressedBuffer, cSize, 0};
    +        {   ZSTD_inBuffer in = {compressedBuffer, cSize, 0};
                 ZSTD_outBuffer out = {decodedBuffer, CNBuffSize, 0};
                 size_t const dSize = ZSTD_decompressStream(dctx, &out, &in);
                 CHECK_Z(dSize);
    @@ -1353,7 +1374,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
         }
         DISPLAYLEVEL(3, "OK \n");
     
    -    DISPLAYLEVEL(3, "test%3d: superblock uncompressible data, too many nocompress superblocks : ", testNb++);
    +    DISPLAYLEVEL(3, "test%3d : superblock uncompressible data: too many nocompress superblocks : ", testNb++);
         {
             ZSTD_CCtx* const cctx = ZSTD_createCCtx();
             const BYTE* src = (BYTE*)CNBuffer; BYTE* dst = (BYTE*)compressedBuffer;
    @@ -1506,14 +1527,14 @@ static int basicUnitTests(U32 const seed, double compressibility)
         }
         DISPLAYLEVEL(3, "OK \n");
     
    -    DISPLAYLEVEL(3, "test%3d : re-use CCtx with expanding block size : ", testNb++);
    +    DISPLAYLEVEL(3, "test%3d : reuse CCtx with expanding block size : ", testNb++);
         {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
             ZSTD_parameters const params = ZSTD_getParams(1, ZSTD_CONTENTSIZE_UNKNOWN, 0);
             assert(params.fParams.contentSizeFlag == 1);  /* block size will be adapted if pledgedSrcSize is enabled */
             CHECK_Z( ZSTD_compressBegin_advanced(cctx, NULL, 0, params, 1 /*pledgedSrcSize*/) );
             CHECK_Z( ZSTD_compressEnd(cctx, compressedBuffer, compressedBufferSize, CNBuffer, 1) ); /* creates a block size of 1 */
     
    -        CHECK_Z( ZSTD_compressBegin_advanced(cctx, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN) );  /* re-use same parameters */
    +        CHECK_Z( ZSTD_compressBegin_advanced(cctx, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN) );  /* reuse same parameters */
             {   size_t const inSize = 2* 128 KB;
                 size_t const outSize = ZSTD_compressBound(inSize);
                 CHECK_Z( ZSTD_compressEnd(cctx, compressedBuffer, outSize, CNBuffer, inSize) );
    @@ -1808,7 +1829,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
             params.cParams.windowLog = ZSTD_WINDOWLOG_MAX;
             for (cnb = 0; cnb < nbCompressions; ++cnb) {
                 DISPLAYLEVEL(6, "run %zu / %zu \n", cnb, nbCompressions);
    -            CHECK_Z( ZSTD_compressBegin_advanced(cctx, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN) );  /* re-use same parameters */
    +            CHECK_Z( ZSTD_compressBegin_advanced(cctx, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN) );  /* reuse same parameters */
                 CHECK_Z( ZSTD_compressEnd(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize) );
             }
             ZSTD_freeCCtx(cctx);
    @@ -2407,6 +2428,14 @@ static int basicUnitTests(U32 const seed, double compressibility)
             }   }
             DISPLAYLEVEL(3, "OK \n");
     
    +#if !defined(ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR) \
    + && !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
    + && !defined(ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR) \
    + && !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
    + && !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
    + && !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
    + && !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
    + && !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
             /* Note : these tests should be replaced by proper regression tests,
              *         but existing ones do not focus on small data + dictionary + all levels.
              */
    @@ -2505,6 +2534,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
     
                 DISPLAYLEVEL(4, "compression efficiency tests OK \n");
             }
    +#endif
     
             ZSTD_freeCCtx(ctxOrig);
             ZSTD_freeCCtx(ctxDuplicated);
    @@ -3656,11 +3686,13 @@ static int basicUnitTests(U32 const seed, double compressibility)
     
             /* Test with block delimiters roundtrip */
             seqsSize = ZSTD_generateSequences(cctx, seqs, srcSize, src, srcSize);
    +        CHECK_Z(seqsSize);
             FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_explicitBlockDelimiters);
             assert(!memcmp(CNBuffer, compressedBuffer, srcSize));
     
             /* Test no block delimiters roundtrip */
             seqsSize = ZSTD_mergeBlockDelimiters(seqs, seqsSize);
    +        CHECK_Z(seqsSize);
             FUZ_decodeSequences(decoded, seqs, seqsSize, src, srcSize, ZSTD_sf_noBlockDelimiters);
             assert(!memcmp(CNBuffer, compressedBuffer, srcSize));
     
    @@ -3669,6 +3701,31 @@ static int basicUnitTests(U32 const seed, double compressibility)
         }
         DISPLAYLEVEL(3, "OK \n");
     
    +    DISPLAYLEVEL(3, "test%3i : ZSTD_generateSequences too small output buffer : ", testNb++);
    +    {
    +        const size_t seqsCapacity = 10;
    +        const size_t srcSize = 150 KB;
    +        const BYTE* src = (BYTE*)CNBuffer;
    +
    +        ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    +        ZSTD_Sequence* const seqs = (ZSTD_Sequence*)malloc(seqsCapacity * sizeof(ZSTD_Sequence));
    +
    +        if (seqs == NULL) goto _output_error;
    +        if (cctx == NULL) goto _output_error;
    +        /* Populate src with random data */
    +        RDG_genBuffer(CNBuffer, srcSize, compressibility, 0.5, seed);
    +
    +        /* Test with block delimiters roundtrip */
    +        {
    +            size_t const seqsSize = ZSTD_generateSequences(cctx, seqs, seqsCapacity, src, srcSize);
    +            if (!ZSTD_isError(seqsSize)) goto _output_error;
    +        }
    +
    +        ZSTD_freeCCtx(cctx);
    +        free(seqs);
    +    }
    +    DISPLAYLEVEL(3, "OK \n");
    +
         DISPLAYLEVEL(3, "test%3i : ZSTD_getSequences followed by ZSTD_compressSequences : ", testNb++);
         {
             const size_t srcSize = 500 KB;
    diff --git a/third-party/zstd/tests/golden-decompression-errors/.gitignore b/third-party/zstd/tests/golden-decompression-errors/.gitignore
    new file mode 100644
    index 00000000..574b3750
    --- /dev/null
    +++ b/third-party/zstd/tests/golden-decompression-errors/.gitignore
    @@ -0,0 +1 @@
    +!*.zst
    diff --git a/third-party/zstd/tests/golden-decompression-errors/off0.bin.zst b/third-party/zstd/tests/golden-decompression-errors/off0.bin.zst
    new file mode 100644
    index 0000000000000000000000000000000000000000..13493fb336c6e3e339c1c224a1a2ada1a8b57a0b
    GIT binary patch
    literal 17
    YcmdPcs{faP!Igo5gMo=b-YwAz<>b*1`HT5V8DO@
    z0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VK
    zfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5
    zV8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM
    z7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*
    z1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd
    z0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwA
    zz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEj
    zFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r
    z3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@
    z0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VK
    mfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM{DB9JumnK>
    
    literal 0
    HcmV?d00001
    
    diff --git a/third-party/zstd/tests/golden-decompression/empty-block.zst b/third-party/zstd/tests/golden-decompression/empty-block.zst
    new file mode 100644
    index 0000000000000000000000000000000000000000..fbfb893e11eb677f1e6444ead8a5829a3a23e53e
    GIT binary patch
    literal 11
    QcmdPcs{faPL6iXq020jt9{>OV
    
    literal 0
    HcmV?d00001
    
    diff --git a/third-party/zstd/tests/golden-decompression/rle-first-block.zst b/third-party/zstd/tests/golden-decompression/rle-first-block.zst
    new file mode 100644
    index 0000000000000000000000000000000000000000..fd067edd74ef9bab1dcf9af83baa7fee24f73287
    GIT binary patch
    literal 45
    acmdPcs{eNh1A_nq6CTVAl>2BW_7DJtPX%=V
    
    literal 0
    HcmV?d00001
    
    diff --git a/third-party/zstd/tests/golden-decompression/zeroSeq_2B.zst b/third-party/zstd/tests/golden-decompression/zeroSeq_2B.zst
    new file mode 100644
    index 0000000000000000000000000000000000000000..f9f3520a6eb823709594cbe57df3c1b497984f48
    GIT binary patch
    literal 25
    gcmdPcs{faPp_PFl!y`2(Cto2vzbGd~k*k3L0BK 4 GB).
    + * Note that, beyond 1 paragraph, this generator produces
    + * a different content than LOREM_genBuffer (even when using same seed).
    + */
    +
    +#include "loremOut.h"
    +#include 
    +#include 
    +#include "lorem.h"    /* LOREM_genBlock */
    +#include "platform.h" /* Compiler options, SET_BINARY_MODE */
    +
    +#define MIN(a, b) ((a) < (b) ? (a) : (b))
    +#define LOREM_BLOCKSIZE (1 << 10)
    +void LOREM_genOut(unsigned long long size, unsigned seed)
    +{
    +    char buff[LOREM_BLOCKSIZE] = { 0 };
    +    unsigned long long total   = 0;
    +    size_t genBlockSize        = (size_t)MIN(size, LOREM_BLOCKSIZE);
    +
    +    /* init */
    +    SET_BINARY_MODE(stdout);
    +
    +    /* Generate Ipsum text, one paragraph at a time */
    +    while (total < size) {
    +        size_t generated =
    +                LOREM_genBlock(buff, genBlockSize, seed++, total == 0, 0);
    +        assert(generated <= genBlockSize);
    +        total += generated;
    +        assert(total <= size);
    +        fwrite(buff,
    +               1,
    +               generated,
    +               stdout); /* note: should check potential write error */
    +        if (size - total < genBlockSize)
    +            genBlockSize = (size_t)(size - total);
    +    }
    +    assert(total == size);
    +}
    diff --git a/third-party/zstd/tests/loremOut.h b/third-party/zstd/tests/loremOut.h
    new file mode 100644
    index 00000000..3a32e116
    --- /dev/null
    +++ b/third-party/zstd/tests/loremOut.h
    @@ -0,0 +1,15 @@
    +/*
    + * Copyright (c) Meta Platforms, Inc. and affiliates.
    + * All rights reserved.
    + *
    + * This source code is licensed under both the BSD-style license (found in the
    + * LICENSE file in the root directory of this source tree) and the GPLv2 (found
    + * in the COPYING file in the root directory of this source tree).
    + * You may select, at your option, one of the above-listed licenses.
    + */
    +
    +/* LOREM_genOut():
    + * Generate @size bytes of compressible data using lorem ipsum generator into
    + * stdout.
    + */
    +void LOREM_genOut(unsigned long long size, unsigned seed);
    diff --git a/third-party/zstd/tests/playTests.sh b/third-party/zstd/tests/playTests.sh
    index 5f595f61..e2a0694f 100755
    --- a/third-party/zstd/tests/playTests.sh
    +++ b/third-party/zstd/tests/playTests.sh
    @@ -1,6 +1,7 @@
     #!/bin/sh
     
    -set -e
    +set -e # exit immediately on error
    +# set -x # print commands before execution (debug)
     
     unset ZSTD_CLEVEL
     unset ZSTD_NBTHREADS
    @@ -16,18 +17,18 @@ datagen() {
     }
     
     zstd() {
    -    if [ -z "$EXEC_PREFIX" ]; then
    +    if [ -z "$EXE_PREFIX" ]; then
             "$ZSTD_BIN" "$@"
         else
    -        "$EXEC_PREFIX" "$ZSTD_BIN" "$@"
    +        "$EXE_PREFIX" "$ZSTD_BIN" "$@"
         fi
     }
     
     sudoZstd() {
    -    if [ -z "$EXEC_PREFIX" ]; then
    +    if [ -z "$EXE_PREFIX" ]; then
             sudo "$ZSTD_BIN" "$@"
         else
    -        sudo "$EXEC_PREFIX" "$ZSTD_BIN" "$@"
    +        sudo "$EXE_PREFIX" "$ZSTD_BIN" "$@"
         fi
     }
     
    @@ -91,7 +92,13 @@ fi
     SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
     PRGDIR="$SCRIPT_DIR/../programs"
     TESTDIR="$SCRIPT_DIR/../tests"
    -UNAME=$(uname)
    +UNAME=${UNAME:-$(uname)}
    +GREP=${GREP:-grep}
    +
    +case "$UNAME" in
    +  SunOS) DIFF=${DIFF:-gdiff} ;;
    +  *) DIFF=${DIFF:-diff} ;;
    +esac
     
     detectedTerminal=false
     if [ -t 0 ] && [ -t 1 ]
    @@ -151,11 +158,6 @@ assertSamePermissions() {
         [ "$STAT1" = "$STAT2" ] || die "permissions on $1 don't match those on $2 ($STAT1 != $STAT2)"
     }
     
    -DIFF="diff"
    -case "$UNAME" in
    -  SunOS) DIFF="gdiff" ;;
    -esac
    -
     
     # check if ZSTD_BIN is defined. if not, use the default value
     if [ -z "${ZSTD_BIN}" ]; then
    @@ -177,7 +179,7 @@ fi
     [ -n "$DATAGEN_BIN" ] || die "datagen not found at $DATAGEN_BIN! \n Please define DATAGEN_BIN pointing to the datagen binary. You might also consider rebuilding zstd tests following the instructions in README.md. "
     println "\nStarting playTests.sh isWindows=$isWindows EXE_PREFIX='$EXE_PREFIX' ZSTD_BIN='$ZSTD_BIN' DATAGEN_BIN='$DATAGEN_BIN'"
     
    -if echo hello | zstd -v -T2 2>&1 > $INTOVOID | grep -q 'multi-threading is disabled'
    +if echo hello | zstd -v -T2 2>&1 > $INTOVOID | $GREP -q 'multi-threading is disabled'
     then
         hasMT=""
     else
    @@ -232,12 +234,23 @@ unset ZSTD_CLEVEL
     println "test : compress to stdout"
     zstd tmp -c > tmpCompressed
     zstd tmp --stdout > tmpCompressed       # long command format
    -println "test : compress to named file"
    +
    +println "test : compress to named file (-o)"
     rm -f tmpCompressed
     zstd tmp -o tmpCompressed
     test -f tmpCompressed   # file must be created
    +
     println "test : force write, correct order"
     zstd tmp -fo tmpCompressed
    +
    +println "test : -c + -o : last one wins"
    +rm -f tmpOut
    +zstd tmp -c > tmpCompressed -o tmpOut
    +test -f tmpOut   # file must be created
    +rm -f tmpCompressed
    +zstd tmp -o tmpOut -c > tmpCompressed
    +test -f tmpCompressed   # file must be created
    +
     println "test : forgotten argument"
     cp tmp tmp2
     zstd tmp2 -fo && die "-o must be followed by filename "
    @@ -253,8 +266,8 @@ println "test : null-length file roundtrip"
     println -n '' | zstd - --stdout | zstd -d --stdout
     println "test : ensure small file doesn't add 3-bytes null block"
     datagen -g1 > tmp1
    -zstd tmp1 -c | wc -c | grep "14"
    -zstd < tmp1  | wc -c | grep "14"
    +zstd tmp1 -c | wc -c | $GREP "14"
    +zstd < tmp1  | wc -c | $GREP "14"
     println "test : decompress file with wrong suffix (must fail)"
     zstd -d tmpCompressed && die "wrong suffix error not detected!"
     zstd -df tmp && die "should have refused : wrong extension"
    @@ -291,9 +304,9 @@ println "test: --no-progress flag"
     zstd tmpro -c --no-progress | zstd -d -f -o "$INTOVOID" --no-progress
     zstd tmpro -cv --no-progress | zstd -dv -f -o "$INTOVOID" --no-progress
     println "test: --progress flag"
    -zstd tmpro -c | zstd -d -f -o "$INTOVOID" --progress 2>&1 | grep -E "[A-Za-z0-9._ ]+: [0-9]+ bytes"
    -zstd tmpro -c | zstd -d -f -q -o "$INTOVOID" --progress 2>&1 | grep -E "[A-Za-z0-9._ ]+: [0-9]+ bytes"
    -zstd tmpro -c | zstd -d -f -v -o "$INTOVOID" 2>&1 | grep -E "[A-Za-z0-9._ ]+: [0-9]+ bytes"
    +zstd tmpro -c | zstd -d -f -o "$INTOVOID" --progress 2>&1 | $GREP '[A-Za-z0-9._ ]*: [0-9]* bytes'
    +zstd tmpro -c | zstd -d -f -q -o "$INTOVOID" --progress 2>&1 | $GREP '[A-Za-z0-9._ ]*: [0-9]* bytes'
    +zstd tmpro -c | zstd -d -f -v -o "$INTOVOID" 2>&1 | $GREP '[A-Za-z0-9._ ]*: [0-9]* bytes'
     rm -f tmpro tmpro.zst
     println "test: overwrite input file (must fail)"
     zstd tmp -fo tmp && die "zstd compression overwrote the input file"
    @@ -320,10 +333,55 @@ zstd -d -f tmp.zst --no-check
     if [ "$isWindows" = false ] && [ "$UNAME" != "AIX" ]; then
       if [ -n "$(which readelf)" ]; then
         println "test: check if binary has executable stack (#2963)"
    -    readelf -lW "$ZSTD_BIN" | grep 'GNU_STACK .* RW ' || die "zstd binary has executable stack!"
    +    readelf -lW "$ZSTD_BIN" | $GREP 'GNU_STACK .* RW ' || die "zstd binary has executable stack!"
       fi
     fi
     
    +println "\n===>  multiple_thread test "
    +
    +datagen > tmp
    +println "test : single-thread "
    +zstd --fast --single-thread tmp -o tmpMT0
    +println "test : one worker thread (default)"
    +zstd --fast -T1 tmp -o tmpMT1
    +println "test : two worker threads "
    +zstd --fast -T2 tmp -o tmpMT2
    +println "test : 16-thread "
    +zstd --fast -T16 tmp -o tmpMT3
    +println "test : 127-thread "
    +zstd --fast -T127 tmp -o tmpMT4
    +println "test : 128-thread "
    +zstd --fast -T128 tmp -o tmpMT5
    +println "test : max allowed numeric value is 4294967295 "
    +zstd --fast -4294967295 tmp -o tmpMT6
    +println "test : numeric value overflows 32-bit unsigned int "
    +zstd --fast -4294967296 tmp -o tmptest9 && die "max allowed numeric value is 4294967295"
    +
    +datagen > tmp
    +println "test : basic compression "
    +zstd -f tmp  # trivial compression case, creates tmp.zst
    +println "test : basic decompression"
    +zstd -d -f -T1 tmp.zst
    +println "note : decompression does not support -T mode, but execution support"
    +rm -rf tmpMT*
    +
    +println "\n===>  --fast_argument test "
    +datagen > tmp
    +println "test : basic compression "
    +zstd -f tmp  # trivial compression case, creates tmp.zst
    +println "test: --fast=1"
    +zstd --fast=1 -f tmp
    +println "test: --fast=99"
    +zstd --fast=99 -f tmp
    +println "test: Invalid value -- negative number"
    +zstd --fast=-1 -f tmp && die "error: Invalid value -- negative number"
    +println "test: Invalid value -- zero"
    +zstd --fast=0 -f tmp && die "error: Invalid value -- 0 number"
    +println "test: max allowed numeric argument of --fast is 4294967295"
    +zstd --fast=4294967295 -f tmp
    +println "test: numeric value overflows 32-bit unsigned int "
    +zstd --fast=4294967296 -f tmp && die "max allowed argument of --fast is 4294967295"
    +
     println "\n===>  --exclude-compressed flag"
     rm -rf precompressedFilterTestDir
     mkdir -p precompressedFilterTestDir
    @@ -352,6 +410,19 @@ zstd --long --rm -r precompressedFilterTestDir
     # Files should get compressed again without the --exclude-compressed flag.
     test -f precompressedFilterTestDir/input.5.zst.zst
     test -f precompressedFilterTestDir/input.6.zst.zst
    +
    +# Test some other compressed file extensions
    +datagen $size > precompressedFilterTestDir/input.flac
    +datagen $size > precompressedFilterTestDir/input.mov
    +datagen $size > precompressedFilterTestDir/input.mp3
    +zstd --exclude-compressed --long --rm -r precompressedFilterTestDir
    +test ! -f precompressedFilterTestDir/input.flac.zst
    +test ! -f precompressedFilterTestDir/input.mov.zst
    +test ! -f precompressedFilterTestDir/input.mp3.zst
    +zstd --long --rm -r precompressedFilterTestDir
    +test -f precompressedFilterTestDir/input.flac.zst
    +test -f precompressedFilterTestDir/input.mov.zst
    +test -f precompressedFilterTestDir/input.mp3.zst
     rm -rf precompressedFilterTestDir
     println "Test completed"
     
    @@ -392,6 +463,8 @@ println "test: --rm is disabled when output is stdout"
     test -f tmp
     zstd --rm tmp -c > $INTOVOID
     test -f tmp # tmp shall still be there
    +zstd --rm tmp --stdout > $INTOVOID
    +test -f tmp # tmp shall still be there
     zstd -f --rm tmp -c > $INTOVOID
     test -f tmp # tmp shall still be there
     zstd -f tmp -c > $INTOVOID --rm
    @@ -409,13 +482,28 @@ zstd -f tmp tmp2 -o tmp3.zst --rm # just warns, no prompt
     test -f tmp
     test -f tmp2
     zstd -q tmp tmp2 -o tmp3.zst --rm && die "should refuse to concatenate"
    -
    +println "test: --rm is active with -o when single input"
    +rm -f tmp2.zst
    +zstd --rm tmp2 -o tmp2.zst
    +test -f tmp2.zst
    +test ! -f tmp2
    +println "test: -c followed by -o => -o wins, so --rm remains active" # (#3719)
    +rm tmp2.zst
    +cp tmp tmp2
    +zstd --rm tmp2 -c > $INTOVOID -o tmp2.zst
    +test ! -f tmp2
    +println "test: -o followed by -c => -c wins, so --rm is disabled" # (#3719)
    +rm tmp3.zst
    +cp tmp tmp2
    +zstd -v --rm tmp2 -o tmp2.zst -c > tmp3.zst
    +test -f tmp2
    +test -f tmp3.zst
     println "test : should quietly not remove non-regular file"
     println hello > tmp
     zstd tmp -f -o "$DEVDEVICE" 2>tmplog > "$INTOVOID"
    -grep "Refusing to remove non-regular file" tmplog && die
    +$GREP "Refusing to remove non-regular file" tmplog && die
     rm -f tmplog
    -zstd tmp -f -o "$INTOVOID" 2>&1 | grep "Refusing to remove non-regular file" && die
    +zstd tmp -f -o "$INTOVOID" 2>&1 | $GREP "Refusing to remove non-regular file" && die
     println "test : --rm on stdin"
     println a | zstd --rm > $INTOVOID   # --rm should remain silent
     rm -f tmp
    @@ -444,6 +532,11 @@ $DIFF -s tmp1 tmp
     touch tmp_empty
     zstd -d -o tmp2 "$TESTDIR/golden-decompression/empty-block.zst"
     $DIFF -s tmp2 tmp_empty
    +
    +zstd -t "$TESTDIR/golden-decompression/zeroSeq_2B.zst"
    +
    +zstd -t "$TESTDIR/golden-decompression-errors/zeroSeq_extraneous.zst" && die "invalid Sequences section should have been detected"
    +
     rm -f tmp*
     
     println "\n===>  compress multiple files"
    @@ -610,7 +703,7 @@ if [ -n "$DEVNULLRIGHTS" ] ; then
         zstd tmp -f -o tmp.zst
         sudoZstd -d tmp.zst -c > $INTOVOID
         sudoZstd -d tmp.zst -o $INTOVOID
    -    ls -las $INTOVOID | grep "rw-rw-rw-"
    +    ls -las $INTOVOID | $GREP "rw-rw-rw-"
     fi
     
     if [ -n "$READFROMBLOCKDEVICE" ] ; then
    @@ -620,7 +713,7 @@ if [ -n "$READFROMBLOCKDEVICE" ] ; then
         println "\n===> checking that zstd can read from a block device"
         datagen -g65536 > tmp.img
         sudo losetup -fP tmp.img
    -    LOOP_DEV=$(losetup -a | grep 'tmp\.img' | cut -f1 -d:)
    +    LOOP_DEV=$(losetup -a | $GREP 'tmp\.img' | cut -f1 -d:)
         [ -z "$LOOP_DEV" ] && die "failed to get loopback device"
         sudoZstd $LOOP_DEV -c > tmp.img.zst && die "should fail without -f"
         sudoZstd -f $LOOP_DEV -c > tmp.img.zst
    @@ -769,13 +862,13 @@ println "\n===> --[no-]content-size tests"
     
     datagen > tmp_contentsize
     zstd -f tmp_contentsize
    -zstd -lv tmp_contentsize.zst | grep "Decompressed Size:"
    +zstd -lv tmp_contentsize.zst | $GREP "Decompressed Size:"
     zstd -f --no-content-size tmp_contentsize
    -zstd -lv tmp_contentsize.zst | grep "Decompressed Size:" && die
    +zstd -lv tmp_contentsize.zst | $GREP "Decompressed Size:" && die
     zstd -f --content-size tmp_contentsize
    -zstd -lv tmp_contentsize.zst | grep "Decompressed Size:"
    +zstd -lv tmp_contentsize.zst | $GREP "Decompressed Size:"
     zstd -f --content-size --no-content-size tmp_contentsize
    -zstd -lv tmp_contentsize.zst | grep "Decompressed Size:" && die
    +zstd -lv tmp_contentsize.zst | $GREP "Decompressed Size:" && die
     rm -rf tmp*
     
     println "test : show-default-cparams regular"
    @@ -795,8 +888,7 @@ rm -rf tmp*
     println "test : show compression parameters in verbose mode"
     datagen > tmp
     zstd -vv tmp 2>&1 | \
    -grep -q -E -- "--zstd=wlog=[[:digit:]]+,clog=[[:digit:]]+,hlog=[[:digit:]]+,\
    -slog=[[:digit:]]+,mml=[[:digit:]]+,tlen=[[:digit:]]+,strat=[[:digit:]]+"
    +$GREP -q -- "--zstd=wlog=[0-9]*,clog=[0-9]*,hlog=[0-9]*,slog=[0-9]*,mml=[0-9]*,tlen=[0-9]*,strat=[0-9]*"
     rm -rf tmp*
     
     println "\n===>  Advanced compression parameters "
    @@ -1093,8 +1185,8 @@ println "- Test --memory for dictionary compression"
     datagen -g12M -P90 > tmpCorpusHighCompress
     zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=10K && die "Dictionary training should fail : --memory too low (10K)"
     zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=5MB 2> zstTrainWithMemLimitStdErr
    -cat zstTrainWithMemLimitStdErr | grep "setting manual memory limit for dictionary training data at 5 MB"
    -cat zstTrainWithMemLimitStdErr | grep "Training samples set too large (12 MB); training on 5 MB only..."
    +cat zstTrainWithMemLimitStdErr | $GREP "setting manual memory limit for dictionary training data at 5 MB"
    +cat zstTrainWithMemLimitStdErr | $GREP "Training samples set too large (12 MB); training on 5 MB only..."
     rm zstTrainWithMemLimitStdErr
     
     println "\n===>  fastCover dictionary builder : advanced options "
    @@ -1380,16 +1472,16 @@ println "\n===> suffix list test"
     ! zstd -d tmp.abc 2> tmplg
     
     if [ $GZIPMODE -ne 1 ]; then
    -    grep ".gz" tmplg > $INTOVOID && die "Unsupported suffix listed"
    +    $GREP ".gz" tmplg > $INTOVOID && die "Unsupported suffix listed"
     fi
     
     if [ $LZMAMODE -ne 1 ]; then
    -    grep ".lzma" tmplg > $INTOVOID && die "Unsupported suffix listed"
    -    grep ".xz" tmplg > $INTOVOID && die "Unsupported suffix listed"
    +    $GREP ".lzma" tmplg > $INTOVOID && die "Unsupported suffix listed"
    +    $GREP ".xz" tmplg > $INTOVOID && die "Unsupported suffix listed"
     fi
     
     if [ $LZ4MODE -ne 1 ]; then
    -    grep ".lz4" tmplg > $INTOVOID && die "Unsupported suffix listed"
    +    $GREP ".lz4" tmplg > $INTOVOID && die "Unsupported suffix listed"
     fi
     
     touch tmp1
    @@ -1518,7 +1610,7 @@ datagen > tmp2
     datagen > tmp3
     zstd tmp*
     zstd -l ./*.zst
    -zstd -lv ./*.zst | grep "Decompressed Size:"  # check that decompressed size is present in header
    +zstd -lv ./*.zst | $GREP "Decompressed Size:"  # check that decompressed size is present in header
     zstd --list ./*.zst
     zstd --list -v ./*.zst
     
    @@ -1561,13 +1653,13 @@ datagen -g0 > tmp5
     zstd tmp5
     zstd -l tmp5.zst
     zstd -l tmp5* && die "-l must fail on non-zstd file"
    -zstd -lv tmp5.zst | grep "Decompressed Size: 0 B (0 B)"  # check that 0 size is present in header
    +zstd -lv tmp5.zst | $GREP "Decompressed Size: 0 B (0 B)"  # check that 0 size is present in header
     zstd -lv tmp5* && die "-l must fail on non-zstd file"
     
     println "\n===>  zstd --list/-l test with no content size field "
     datagen -g513K | zstd > tmp6.zst
     zstd -l tmp6.zst
    -zstd -lv tmp6.zst | grep "Decompressed Size:"  && die "Field :Decompressed Size: should not be available in this compressed file"
    +zstd -lv tmp6.zst | $GREP "Decompressed Size:"  && die "Field :Decompressed Size: should not be available in this compressed file"
     
     println "\n===>   zstd --list/-l test with no checksum "
     zstd -f --no-check tmp1
    @@ -1602,22 +1694,24 @@ roundTripTest -g1M -P50 "1 --single-thread --long=29" " --long=28 --memory=512MB
     roundTripTest -g1M -P50 "1 --single-thread --long=29" " --zstd=wlog=28 --memory=512MB"
     
     
    -println "\n===>  zstd long distance matching with optimal parser compressed size tests "
    -optCSize16=$(datagen -g511K | zstd -16 -c | wc -c)
    -longCSize16=$(datagen -g511K | zstd -16 --long -c | wc -c)
    -optCSize19=$(datagen -g2M | zstd -19 -c | wc -c)
    -longCSize19=$(datagen -g2M | zstd -19 --long -c | wc -c)
    -optCSize19wlog23=$(datagen -g2M | zstd -19 -c  --zstd=wlog=23 | wc -c)
    -longCSize19wlog23=$(datagen -g2M | zstd -19 -c --long=23 | wc -c)
    -if [ "$longCSize16" -gt "$optCSize16" ]; then
    -    echo using --long on compression level 16 should not cause compressed size regression
    -    exit 1
    -elif [ "$longCSize19" -gt "$optCSize19" ]; then
    -    echo using --long on compression level 19 should not cause compressed size regression
    -    exit 1
    -elif [ "$longCSize19wlog23" -gt "$optCSize19wlog23" ]; then
    -    echo using --long on compression level 19 with wLog=23 should not cause compressed size regression
    -    exit 1
    +if [ "$ZSTD_LIB_EXCLUDE_COMPRESSORS_DFAST_AND_UP" -ne "1" ]; then
    +    println "\n===>  zstd long distance matching with optimal parser compressed size tests "
    +    optCSize16=$(datagen -g511K | zstd -16 -c | wc -c)
    +    longCSize16=$(datagen -g511K | zstd -16 --long -c | wc -c)
    +    optCSize19=$(datagen -g2M | zstd -19 -c | wc -c)
    +    longCSize19=$(datagen -g2M | zstd -19 --long -c | wc -c)
    +    optCSize19wlog23=$(datagen -g2M | zstd -19 -c  --zstd=wlog=23 | wc -c)
    +    longCSize19wlog23=$(datagen -g2M | zstd -19 -c --long=23 | wc -c)
    +    if [ "$longCSize16" -gt "$optCSize16" ]; then
    +        echo using --long on compression level 16 should not cause compressed size regression
    +        exit 1
    +    elif [ "$longCSize19" -gt "$optCSize19" ]; then
    +        echo using --long on compression level 19 should not cause compressed size regression
    +        exit 1
    +    elif [ "$longCSize19wlog23" -gt "$optCSize19wlog23" ]; then
    +        echo using --long on compression level 19 with wLog=23 should not cause compressed size regression
    +        exit 1
    +    fi
     fi
     
     println "\n===>  zstd asyncio tests "
    @@ -1708,9 +1802,15 @@ zstd --patch-from=tmp_dict -r tmp_dir && die
     rm -rf tmp*
     
     println "\n===> patch-from long mode trigger larger file test"
    -datagen -g5000000 > tmp_dict
    -datagen -g5000000 > tmp_patch
    -zstd -15 --patch-from=tmp_dict tmp_patch 2>&1 | grep "long mode automatically triggered"
    +if [ "$ZSTD_LIB_EXCLUDE_COMPRESSORS_DFAST_AND_UP" -eq "1" ]; then
    +    # if binary tree strategies are excluded, the threshold is different
    +    datagen -g10000000 > tmp_dict
    +    datagen -g10000000 > tmp_patch
    +else
    +    datagen -g5000000 > tmp_dict
    +    datagen -g5000000 > tmp_patch
    +fi
    +zstd -15 --patch-from=tmp_dict tmp_patch 2>&1 | $GREP "long mode automatically triggered"
     rm -rf tmp*
     
     println "\n===> patch-from very large dictionary and file test"
    diff --git a/third-party/zstd/tests/regression/results.csv b/third-party/zstd/tests/regression/results.csv
    index d072c0d8..fc3fbe7c 100644
    --- a/third-party/zstd/tests/regression/results.csv
    +++ b/third-party/zstd/tests/regression/results.csv
    @@ -11,10 +11,10 @@ silesia.tar,                        level 6,                            compress
     silesia.tar,                        level 7,                            compress simple,                    4579828
     silesia.tar,                        level 9,                            compress simple,                    4555448
     silesia.tar,                        level 13,                           compress simple,                    4502956
    -silesia.tar,                        level 16,                           compress simple,                    4360546
    -silesia.tar,                        level 19,                           compress simple,                    4265911
    +silesia.tar,                        level 16,                           compress simple,                    4360385
    +silesia.tar,                        level 19,                           compress simple,                    4260939
     silesia.tar,                        uncompressed literals,              compress simple,                    4854086
    -silesia.tar,                        uncompressed literals optimal,      compress simple,                    4265911
    +silesia.tar,                        uncompressed literals optimal,      compress simple,                    4260939
     silesia.tar,                        huffman literals,                   compress simple,                    6179047
     github.tar,                         level -5,                           compress simple,                    52115
     github.tar,                         level -3,                           compress simple,                    45678
    @@ -29,9 +29,9 @@ github.tar,                         level 7,                            compress
     github.tar,                         level 9,                            compress simple,                    36723
     github.tar,                         level 13,                           compress simple,                    35501
     github.tar,                         level 16,                           compress simple,                    40466
    -github.tar,                         level 19,                           compress simple,                    32276
    +github.tar,                         level 19,                           compress simple,                    32262
     github.tar,                         uncompressed literals,              compress simple,                    38831
    -github.tar,                         uncompressed literals optimal,      compress simple,                    32276
    +github.tar,                         uncompressed literals optimal,      compress simple,                    32262
     github.tar,                         huffman literals,                   compress simple,                    42560
     silesia,                            level -5,                           compress cctx,                      6857372
     silesia,                            level -3,                           compress cctx,                      6503412
    @@ -45,8 +45,8 @@ silesia,                            level 6,                            compress
     silesia,                            level 7,                            compress cctx,                      4570271
     silesia,                            level 9,                            compress cctx,                      4545850
     silesia,                            level 13,                           compress cctx,                      4493990
    -silesia,                            level 16,                           compress cctx,                      4360041
    -silesia,                            level 19,                           compress cctx,                      4296055
    +silesia,                            level 16,                           compress cctx,                      4359652
    +silesia,                            level 19,                           compress cctx,                      4266582
     silesia,                            long distance mode,                 compress cctx,                      4842075
     silesia,                            multithreaded,                      compress cctx,                      4842075
     silesia,                            multithreaded long distance mode,   compress cctx,                      4842075
    @@ -55,7 +55,7 @@ silesia,                            small hash log,                     compress
     silesia,                            small chain log,                    compress cctx,                      4912197
     silesia,                            explicit params,                    compress cctx,                      4794318
     silesia,                            uncompressed literals,              compress cctx,                      4842075
    -silesia,                            uncompressed literals optimal,      compress cctx,                      4296055
    +silesia,                            uncompressed literals optimal,      compress cctx,                      4266582
     silesia,                            huffman literals,                   compress cctx,                      6172202
     silesia,                            multithreaded with advanced params, compress cctx,                      4842075
     github,                             level -5,                           compress cctx,                      204407
    @@ -83,9 +83,9 @@ github,                             level 9 with dict,                  compress
     github,                             level 13,                           compress cctx,                      132878
     github,                             level 13 with dict,                 compress cctx,                      39948
     github,                             level 16,                           compress cctx,                      133209
    -github,                             level 16 with dict,                 compress cctx,                      37568
    +github,                             level 16 with dict,                 compress cctx,                      37892
     github,                             level 19,                           compress cctx,                      132879
    -github,                             level 19 with dict,                 compress cctx,                      37567
    +github,                             level 19 with dict,                 compress cctx,                      37906
     github,                             long distance mode,                 compress cctx,                      141069
     github,                             multithreaded,                      compress cctx,                      141069
     github,                             multithreaded long distance mode,   compress cctx,                      141069
    @@ -109,8 +109,8 @@ silesia,                            level 6,                            zstdcli,
     silesia,                            level 7,                            zstdcli,                            4570319
     silesia,                            level 9,                            zstdcli,                            4545898
     silesia,                            level 13,                           zstdcli,                            4494038
    -silesia,                            level 16,                           zstdcli,                            4360089
    -silesia,                            level 19,                           zstdcli,                            4296103
    +silesia,                            level 16,                           zstdcli,                            4359700
    +silesia,                            level 19,                           zstdcli,                            4266630
     silesia,                            long distance mode,                 zstdcli,                            4833785
     silesia,                            multithreaded,                      zstdcli,                            4842123
     silesia,                            multithreaded long distance mode,   zstdcli,                            4833785
    @@ -119,7 +119,7 @@ silesia,                            small hash log,                     zstdcli,
     silesia,                            small chain log,                    zstdcli,                            4912245
     silesia,                            explicit params,                    zstdcli,                            4795840
     silesia,                            uncompressed literals,              zstdcli,                            5120614
    -silesia,                            uncompressed literals optimal,      zstdcli,                            4319566
    +silesia,                            uncompressed literals optimal,      zstdcli,                            4316928
     silesia,                            huffman literals,                   zstdcli,                            5321417
     silesia,                            multithreaded with advanced params, zstdcli,                            5120614
     silesia.tar,                        level -5,                           zstdcli,                            6862049
    @@ -134,8 +134,8 @@ silesia.tar,                        level 6,                            zstdcli,
     silesia.tar,                        level 7,                            zstdcli,                            4581791
     silesia.tar,                        level 9,                            zstdcli,                            4555452
     silesia.tar,                        level 13,                           zstdcli,                            4502960
    -silesia.tar,                        level 16,                           zstdcli,                            4360550
    -silesia.tar,                        level 19,                           zstdcli,                            4265915
    +silesia.tar,                        level 16,                           zstdcli,                            4360389
    +silesia.tar,                        level 19,                           zstdcli,                            4260943
     silesia.tar,                        no source size,                     zstdcli,                            4854160
     silesia.tar,                        long distance mode,                 zstdcli,                            4845745
     silesia.tar,                        multithreaded,                      zstdcli,                            4854164
    @@ -145,7 +145,7 @@ silesia.tar,                        small hash log,                     zstdcli,
     silesia.tar,                        small chain log,                    zstdcli,                            4917022
     silesia.tar,                        explicit params,                    zstdcli,                            4821112
     silesia.tar,                        uncompressed literals,              zstdcli,                            5122571
    -silesia.tar,                        uncompressed literals optimal,      zstdcli,                            4310145
    +silesia.tar,                        uncompressed literals optimal,      zstdcli,                            4308455
     silesia.tar,                        huffman literals,                   zstdcli,                            5342074
     silesia.tar,                        multithreaded with advanced params, zstdcli,                            5122571
     github,                             level -5,                           zstdcli,                            206407
    @@ -173,9 +173,9 @@ github,                             level 9 with dict,                  zstdcli,
     github,                             level 13,                           zstdcli,                            134878
     github,                             level 13 with dict,                 zstdcli,                            41900
     github,                             level 16,                           zstdcli,                            135209
    -github,                             level 16 with dict,                 zstdcli,                            39577
    +github,                             level 16 with dict,                 zstdcli,                            39902
     github,                             level 19,                           zstdcli,                            134879
    -github,                             level 19 with dict,                 zstdcli,                            39576
    +github,                             level 19 with dict,                 zstdcli,                            39916
     github,                             long distance mode,                 zstdcli,                            138332
     github,                             multithreaded,                      zstdcli,                            138332
     github,                             multithreaded long distance mode,   zstdcli,                            138332
    @@ -212,9 +212,9 @@ github.tar,                         level 9 with dict,                  zstdcli,
     github.tar,                         level 13,                           zstdcli,                            35505
     github.tar,                         level 13 with dict,                 zstdcli,                            37134
     github.tar,                         level 16,                           zstdcli,                            40470
    -github.tar,                         level 16 with dict,                 zstdcli,                            33378
    -github.tar,                         level 19,                           zstdcli,                            32280
    -github.tar,                         level 19 with dict,                 zstdcli,                            32716
    +github.tar,                         level 16 with dict,                 zstdcli,                            33379
    +github.tar,                         level 19,                           zstdcli,                            32266
    +github.tar,                         level 19 with dict,                 zstdcli,                            32705
     github.tar,                         no source size,                     zstdcli,                            38832
     github.tar,                         no source size with dict,           zstdcli,                            38004
     github.tar,                         long distance mode,                 zstdcli,                            40236
    @@ -225,7 +225,7 @@ github.tar,                         small hash log,                     zstdcli,
     github.tar,                         small chain log,                    zstdcli,                            41673
     github.tar,                         explicit params,                    zstdcli,                            41385
     github.tar,                         uncompressed literals,              zstdcli,                            41529
    -github.tar,                         uncompressed literals optimal,      zstdcli,                            35401
    +github.tar,                         uncompressed literals optimal,      zstdcli,                            35360
     github.tar,                         huffman literals,                   zstdcli,                            38857
     github.tar,                         multithreaded with advanced params, zstdcli,                            41529
     silesia,                            level -5,                           advanced one pass,                  6857372
    @@ -248,8 +248,8 @@ silesia,                            level 11 row 2,                     advanced
     silesia,                            level 12 row 1,                     advanced one pass,                  4505658
     silesia,                            level 12 row 2,                     advanced one pass,                  4503429
     silesia,                            level 13,                           advanced one pass,                  4493990
    -silesia,                            level 16,                           advanced one pass,                  4360041
    -silesia,                            level 19,                           advanced one pass,                  4296055
    +silesia,                            level 16,                           advanced one pass,                  4359652
    +silesia,                            level 19,                           advanced one pass,                  4266582
     silesia,                            no source size,                     advanced one pass,                  4842075
     silesia,                            long distance mode,                 advanced one pass,                  4833710
     silesia,                            multithreaded,                      advanced one pass,                  4842075
    @@ -259,7 +259,7 @@ silesia,                            small hash log,                     advanced
     silesia,                            small chain log,                    advanced one pass,                  4912197
     silesia,                            explicit params,                    advanced one pass,                  4795840
     silesia,                            uncompressed literals,              advanced one pass,                  5120566
    -silesia,                            uncompressed literals optimal,      advanced one pass,                  4319518
    +silesia,                            uncompressed literals optimal,      advanced one pass,                  4316880
     silesia,                            huffman literals,                   advanced one pass,                  5321369
     silesia,                            multithreaded with advanced params, advanced one pass,                  5120566
     silesia.tar,                        level -5,                           advanced one pass,                  6861055
    @@ -282,8 +282,8 @@ silesia.tar,                        level 11 row 2,                     advanced
     silesia.tar,                        level 12 row 1,                     advanced one pass,                  4514517
     silesia.tar,                        level 12 row 2,                     advanced one pass,                  4514007
     silesia.tar,                        level 13,                           advanced one pass,                  4502956
    -silesia.tar,                        level 16,                           advanced one pass,                  4360546
    -silesia.tar,                        level 19,                           advanced one pass,                  4265911
    +silesia.tar,                        level 16,                           advanced one pass,                  4360385
    +silesia.tar,                        level 19,                           advanced one pass,                  4260939
     silesia.tar,                        no source size,                     advanced one pass,                  4854086
     silesia.tar,                        long distance mode,                 advanced one pass,                  4840452
     silesia.tar,                        multithreaded,                      advanced one pass,                  4854160
    @@ -293,7 +293,7 @@ silesia.tar,                        small hash log,                     advanced
     silesia.tar,                        small chain log,                    advanced one pass,                  4917041
     silesia.tar,                        explicit params,                    advanced one pass,                  4807274
     silesia.tar,                        uncompressed literals,              advanced one pass,                  5122473
    -silesia.tar,                        uncompressed literals optimal,      advanced one pass,                  4310141
    +silesia.tar,                        uncompressed literals optimal,      advanced one pass,                  4308451
     silesia.tar,                        huffman literals,                   advanced one pass,                  5341705
     silesia.tar,                        multithreaded with advanced params, advanced one pass,                  5122567
     github,                             level -5,                           advanced one pass,                  204407
    @@ -397,17 +397,17 @@ github,                             level 13 with dict dds,             advanced
     github,                             level 13 with dict copy,            advanced one pass,                  39948
     github,                             level 13 with dict load,            advanced one pass,                  42624
     github,                             level 16,                           advanced one pass,                  133209
    -github,                             level 16 with dict,                 advanced one pass,                  37577
    -github,                             level 16 with dict dms,             advanced one pass,                  37577
    -github,                             level 16 with dict dds,             advanced one pass,                  37577
    -github,                             level 16 with dict copy,            advanced one pass,                  37568
    -github,                             level 16 with dict load,            advanced one pass,                  42338
    +github,                             level 16 with dict,                 advanced one pass,                  37902
    +github,                             level 16 with dict dms,             advanced one pass,                  37902
    +github,                             level 16 with dict dds,             advanced one pass,                  37902
    +github,                             level 16 with dict copy,            advanced one pass,                  37892
    +github,                             level 16 with dict load,            advanced one pass,                  42402
     github,                             level 19,                           advanced one pass,                  132879
    -github,                             level 19 with dict,                 advanced one pass,                  37576
    -github,                             level 19 with dict dms,             advanced one pass,                  37576
    -github,                             level 19 with dict dds,             advanced one pass,                  37576
    -github,                             level 19 with dict copy,            advanced one pass,                  37567
    -github,                             level 19 with dict load,            advanced one pass,                  39613
    +github,                             level 19 with dict,                 advanced one pass,                  37916
    +github,                             level 19 with dict dms,             advanced one pass,                  37916
    +github,                             level 19 with dict dds,             advanced one pass,                  37916
    +github,                             level 19 with dict copy,            advanced one pass,                  37906
    +github,                             level 19 with dict load,            advanced one pass,                  39770
     github,                             no source size,                     advanced one pass,                  136332
     github,                             no source size with dict,           advanced one pass,                  41148
     github,                             long distance mode,                 advanced one pass,                  136332
    @@ -522,17 +522,17 @@ github.tar,                         level 13 with dict dds,             advanced
     github.tar,                         level 13 with dict copy,            advanced one pass,                  37130
     github.tar,                         level 13 with dict load,            advanced one pass,                  36010
     github.tar,                         level 16,                           advanced one pass,                  40466
    -github.tar,                         level 16 with dict,                 advanced one pass,                  33374
    -github.tar,                         level 16 with dict dms,             advanced one pass,                  33206
    -github.tar,                         level 16 with dict dds,             advanced one pass,                  33206
    -github.tar,                         level 16 with dict copy,            advanced one pass,                  33374
    +github.tar,                         level 16 with dict,                 advanced one pass,                  33375
    +github.tar,                         level 16 with dict dms,             advanced one pass,                  33207
    +github.tar,                         level 16 with dict dds,             advanced one pass,                  33207
    +github.tar,                         level 16 with dict copy,            advanced one pass,                  33375
     github.tar,                         level 16 with dict load,            advanced one pass,                  39081
    -github.tar,                         level 19,                           advanced one pass,                  32276
    -github.tar,                         level 19 with dict,                 advanced one pass,                  32712
    -github.tar,                         level 19 with dict dms,             advanced one pass,                  32555
    -github.tar,                         level 19 with dict dds,             advanced one pass,                  32555
    -github.tar,                         level 19 with dict copy,            advanced one pass,                  32712
    -github.tar,                         level 19 with dict load,            advanced one pass,                  32479
    +github.tar,                         level 19,                           advanced one pass,                  32262
    +github.tar,                         level 19 with dict,                 advanced one pass,                  32701
    +github.tar,                         level 19 with dict dms,             advanced one pass,                  32565
    +github.tar,                         level 19 with dict dds,             advanced one pass,                  32565
    +github.tar,                         level 19 with dict copy,            advanced one pass,                  32701
    +github.tar,                         level 19 with dict load,            advanced one pass,                  32428
     github.tar,                         no source size,                     advanced one pass,                  38831
     github.tar,                         no source size with dict,           advanced one pass,                  37995
     github.tar,                         long distance mode,                 advanced one pass,                  40252
    @@ -543,7 +543,7 @@ github.tar,                         small hash log,                     advanced
     github.tar,                         small chain log,                    advanced one pass,                  41669
     github.tar,                         explicit params,                    advanced one pass,                  41385
     github.tar,                         uncompressed literals,              advanced one pass,                  41525
    -github.tar,                         uncompressed literals optimal,      advanced one pass,                  35397
    +github.tar,                         uncompressed literals optimal,      advanced one pass,                  35356
     github.tar,                         huffman literals,                   advanced one pass,                  38853
     github.tar,                         multithreaded with advanced params, advanced one pass,                  41525
     silesia,                            level -5,                           advanced one pass small out,        6857372
    @@ -566,8 +566,8 @@ silesia,                            level 11 row 2,                     advanced
     silesia,                            level 12 row 1,                     advanced one pass small out,        4505658
     silesia,                            level 12 row 2,                     advanced one pass small out,        4503429
     silesia,                            level 13,                           advanced one pass small out,        4493990
    -silesia,                            level 16,                           advanced one pass small out,        4360041
    -silesia,                            level 19,                           advanced one pass small out,        4296055
    +silesia,                            level 16,                           advanced one pass small out,        4359652
    +silesia,                            level 19,                           advanced one pass small out,        4266582
     silesia,                            no source size,                     advanced one pass small out,        4842075
     silesia,                            long distance mode,                 advanced one pass small out,        4833710
     silesia,                            multithreaded,                      advanced one pass small out,        4842075
    @@ -577,7 +577,7 @@ silesia,                            small hash log,                     advanced
     silesia,                            small chain log,                    advanced one pass small out,        4912197
     silesia,                            explicit params,                    advanced one pass small out,        4795840
     silesia,                            uncompressed literals,              advanced one pass small out,        5120566
    -silesia,                            uncompressed literals optimal,      advanced one pass small out,        4319518
    +silesia,                            uncompressed literals optimal,      advanced one pass small out,        4316880
     silesia,                            huffman literals,                   advanced one pass small out,        5321369
     silesia,                            multithreaded with advanced params, advanced one pass small out,        5120566
     silesia.tar,                        level -5,                           advanced one pass small out,        6861055
    @@ -600,8 +600,8 @@ silesia.tar,                        level 11 row 2,                     advanced
     silesia.tar,                        level 12 row 1,                     advanced one pass small out,        4514517
     silesia.tar,                        level 12 row 2,                     advanced one pass small out,        4514007
     silesia.tar,                        level 13,                           advanced one pass small out,        4502956
    -silesia.tar,                        level 16,                           advanced one pass small out,        4360546
    -silesia.tar,                        level 19,                           advanced one pass small out,        4265911
    +silesia.tar,                        level 16,                           advanced one pass small out,        4360385
    +silesia.tar,                        level 19,                           advanced one pass small out,        4260939
     silesia.tar,                        no source size,                     advanced one pass small out,        4854086
     silesia.tar,                        long distance mode,                 advanced one pass small out,        4840452
     silesia.tar,                        multithreaded,                      advanced one pass small out,        4854160
    @@ -611,7 +611,7 @@ silesia.tar,                        small hash log,                     advanced
     silesia.tar,                        small chain log,                    advanced one pass small out,        4917041
     silesia.tar,                        explicit params,                    advanced one pass small out,        4807274
     silesia.tar,                        uncompressed literals,              advanced one pass small out,        5122473
    -silesia.tar,                        uncompressed literals optimal,      advanced one pass small out,        4310141
    +silesia.tar,                        uncompressed literals optimal,      advanced one pass small out,        4308451
     silesia.tar,                        huffman literals,                   advanced one pass small out,        5341705
     silesia.tar,                        multithreaded with advanced params, advanced one pass small out,        5122567
     github,                             level -5,                           advanced one pass small out,        204407
    @@ -715,17 +715,17 @@ github,                             level 13 with dict dds,             advanced
     github,                             level 13 with dict copy,            advanced one pass small out,        39948
     github,                             level 13 with dict load,            advanced one pass small out,        42624
     github,                             level 16,                           advanced one pass small out,        133209
    -github,                             level 16 with dict,                 advanced one pass small out,        37577
    -github,                             level 16 with dict dms,             advanced one pass small out,        37577
    -github,                             level 16 with dict dds,             advanced one pass small out,        37577
    -github,                             level 16 with dict copy,            advanced one pass small out,        37568
    -github,                             level 16 with dict load,            advanced one pass small out,        42338
    +github,                             level 16 with dict,                 advanced one pass small out,        37902
    +github,                             level 16 with dict dms,             advanced one pass small out,        37902
    +github,                             level 16 with dict dds,             advanced one pass small out,        37902
    +github,                             level 16 with dict copy,            advanced one pass small out,        37892
    +github,                             level 16 with dict load,            advanced one pass small out,        42402
     github,                             level 19,                           advanced one pass small out,        132879
    -github,                             level 19 with dict,                 advanced one pass small out,        37576
    -github,                             level 19 with dict dms,             advanced one pass small out,        37576
    -github,                             level 19 with dict dds,             advanced one pass small out,        37576
    -github,                             level 19 with dict copy,            advanced one pass small out,        37567
    -github,                             level 19 with dict load,            advanced one pass small out,        39613
    +github,                             level 19 with dict,                 advanced one pass small out,        37916
    +github,                             level 19 with dict dms,             advanced one pass small out,        37916
    +github,                             level 19 with dict dds,             advanced one pass small out,        37916
    +github,                             level 19 with dict copy,            advanced one pass small out,        37906
    +github,                             level 19 with dict load,            advanced one pass small out,        39770
     github,                             no source size,                     advanced one pass small out,        136332
     github,                             no source size with dict,           advanced one pass small out,        41148
     github,                             long distance mode,                 advanced one pass small out,        136332
    @@ -840,17 +840,17 @@ github.tar,                         level 13 with dict dds,             advanced
     github.tar,                         level 13 with dict copy,            advanced one pass small out,        37130
     github.tar,                         level 13 with dict load,            advanced one pass small out,        36010
     github.tar,                         level 16,                           advanced one pass small out,        40466
    -github.tar,                         level 16 with dict,                 advanced one pass small out,        33374
    -github.tar,                         level 16 with dict dms,             advanced one pass small out,        33206
    -github.tar,                         level 16 with dict dds,             advanced one pass small out,        33206
    -github.tar,                         level 16 with dict copy,            advanced one pass small out,        33374
    +github.tar,                         level 16 with dict,                 advanced one pass small out,        33375
    +github.tar,                         level 16 with dict dms,             advanced one pass small out,        33207
    +github.tar,                         level 16 with dict dds,             advanced one pass small out,        33207
    +github.tar,                         level 16 with dict copy,            advanced one pass small out,        33375
     github.tar,                         level 16 with dict load,            advanced one pass small out,        39081
    -github.tar,                         level 19,                           advanced one pass small out,        32276
    -github.tar,                         level 19 with dict,                 advanced one pass small out,        32712
    -github.tar,                         level 19 with dict dms,             advanced one pass small out,        32555
    -github.tar,                         level 19 with dict dds,             advanced one pass small out,        32555
    -github.tar,                         level 19 with dict copy,            advanced one pass small out,        32712
    -github.tar,                         level 19 with dict load,            advanced one pass small out,        32479
    +github.tar,                         level 19,                           advanced one pass small out,        32262
    +github.tar,                         level 19 with dict,                 advanced one pass small out,        32701
    +github.tar,                         level 19 with dict dms,             advanced one pass small out,        32565
    +github.tar,                         level 19 with dict dds,             advanced one pass small out,        32565
    +github.tar,                         level 19 with dict copy,            advanced one pass small out,        32701
    +github.tar,                         level 19 with dict load,            advanced one pass small out,        32428
     github.tar,                         no source size,                     advanced one pass small out,        38831
     github.tar,                         no source size with dict,           advanced one pass small out,        37995
     github.tar,                         long distance mode,                 advanced one pass small out,        40252
    @@ -861,7 +861,7 @@ github.tar,                         small hash log,                     advanced
     github.tar,                         small chain log,                    advanced one pass small out,        41669
     github.tar,                         explicit params,                    advanced one pass small out,        41385
     github.tar,                         uncompressed literals,              advanced one pass small out,        41525
    -github.tar,                         uncompressed literals optimal,      advanced one pass small out,        35397
    +github.tar,                         uncompressed literals optimal,      advanced one pass small out,        35356
     github.tar,                         huffman literals,                   advanced one pass small out,        38853
     github.tar,                         multithreaded with advanced params, advanced one pass small out,        41525
     silesia,                            level -5,                           advanced streaming,                 6854744
    @@ -884,8 +884,8 @@ silesia,                            level 11 row 2,                     advanced
     silesia,                            level 12 row 1,                     advanced streaming,                 4505658
     silesia,                            level 12 row 2,                     advanced streaming,                 4503429
     silesia,                            level 13,                           advanced streaming,                 4493990
    -silesia,                            level 16,                           advanced streaming,                 4360041
    -silesia,                            level 19,                           advanced streaming,                 4296055
    +silesia,                            level 16,                           advanced streaming,                 4359652
    +silesia,                            level 19,                           advanced streaming,                 4266582
     silesia,                            no source size,                     advanced streaming,                 4842039
     silesia,                            long distance mode,                 advanced streaming,                 4833710
     silesia,                            multithreaded,                      advanced streaming,                 4842075
    @@ -895,7 +895,7 @@ silesia,                            small hash log,                     advanced
     silesia,                            small chain log,                    advanced streaming,                 4912197
     silesia,                            explicit params,                    advanced streaming,                 4795857
     silesia,                            uncompressed literals,              advanced streaming,                 5120566
    -silesia,                            uncompressed literals optimal,      advanced streaming,                 4319518
    +silesia,                            uncompressed literals optimal,      advanced streaming,                 4316880
     silesia,                            huffman literals,                   advanced streaming,                 5321370
     silesia,                            multithreaded with advanced params, advanced streaming,                 5120566
     silesia.tar,                        level -5,                           advanced streaming,                 6856523
    @@ -918,8 +918,8 @@ silesia.tar,                        level 11 row 2,                     advanced
     silesia.tar,                        level 12 row 1,                     advanced streaming,                 4514514
     silesia.tar,                        level 12 row 2,                     advanced streaming,                 4514003
     silesia.tar,                        level 13,                           advanced streaming,                 4502956
    -silesia.tar,                        level 16,                           advanced streaming,                 4360546
    -silesia.tar,                        level 19,                           advanced streaming,                 4265911
    +silesia.tar,                        level 16,                           advanced streaming,                 4360385
    +silesia.tar,                        level 19,                           advanced streaming,                 4260939
     silesia.tar,                        no source size,                     advanced streaming,                 4859267
     silesia.tar,                        long distance mode,                 advanced streaming,                 4840452
     silesia.tar,                        multithreaded,                      advanced streaming,                 4854160
    @@ -929,7 +929,7 @@ silesia.tar,                        small hash log,                     advanced
     silesia.tar,                        small chain log,                    advanced streaming,                 4917021
     silesia.tar,                        explicit params,                    advanced streaming,                 4807288
     silesia.tar,                        uncompressed literals,              advanced streaming,                 5127423
    -silesia.tar,                        uncompressed literals optimal,      advanced streaming,                 4310141
    +silesia.tar,                        uncompressed literals optimal,      advanced streaming,                 4308451
     silesia.tar,                        huffman literals,                   advanced streaming,                 5341712
     silesia.tar,                        multithreaded with advanced params, advanced streaming,                 5122567
     github,                             level -5,                           advanced streaming,                 204407
    @@ -1033,17 +1033,17 @@ github,                             level 13 with dict dds,             advanced
     github,                             level 13 with dict copy,            advanced streaming,                 39948
     github,                             level 13 with dict load,            advanced streaming,                 42624
     github,                             level 16,                           advanced streaming,                 133209
    -github,                             level 16 with dict,                 advanced streaming,                 37577
    -github,                             level 16 with dict dms,             advanced streaming,                 37577
    -github,                             level 16 with dict dds,             advanced streaming,                 37577
    -github,                             level 16 with dict copy,            advanced streaming,                 37568
    -github,                             level 16 with dict load,            advanced streaming,                 42338
    +github,                             level 16 with dict,                 advanced streaming,                 37902
    +github,                             level 16 with dict dms,             advanced streaming,                 37902
    +github,                             level 16 with dict dds,             advanced streaming,                 37902
    +github,                             level 16 with dict copy,            advanced streaming,                 37892
    +github,                             level 16 with dict load,            advanced streaming,                 42402
     github,                             level 19,                           advanced streaming,                 132879
    -github,                             level 19 with dict,                 advanced streaming,                 37576
    -github,                             level 19 with dict dms,             advanced streaming,                 37576
    -github,                             level 19 with dict dds,             advanced streaming,                 37576
    -github,                             level 19 with dict copy,            advanced streaming,                 37567
    -github,                             level 19 with dict load,            advanced streaming,                 39613
    +github,                             level 19 with dict,                 advanced streaming,                 37916
    +github,                             level 19 with dict dms,             advanced streaming,                 37916
    +github,                             level 19 with dict dds,             advanced streaming,                 37916
    +github,                             level 19 with dict copy,            advanced streaming,                 37906
    +github,                             level 19 with dict load,            advanced streaming,                 39770
     github,                             no source size,                     advanced streaming,                 136332
     github,                             no source size with dict,           advanced streaming,                 41148
     github,                             long distance mode,                 advanced streaming,                 136332
    @@ -1158,17 +1158,17 @@ github.tar,                         level 13 with dict dds,             advanced
     github.tar,                         level 13 with dict copy,            advanced streaming,                 37130
     github.tar,                         level 13 with dict load,            advanced streaming,                 36010
     github.tar,                         level 16,                           advanced streaming,                 40466
    -github.tar,                         level 16 with dict,                 advanced streaming,                 33374
    -github.tar,                         level 16 with dict dms,             advanced streaming,                 33206
    -github.tar,                         level 16 with dict dds,             advanced streaming,                 33206
    -github.tar,                         level 16 with dict copy,            advanced streaming,                 33374
    +github.tar,                         level 16 with dict,                 advanced streaming,                 33375
    +github.tar,                         level 16 with dict dms,             advanced streaming,                 33207
    +github.tar,                         level 16 with dict dds,             advanced streaming,                 33207
    +github.tar,                         level 16 with dict copy,            advanced streaming,                 33375
     github.tar,                         level 16 with dict load,            advanced streaming,                 39081
    -github.tar,                         level 19,                           advanced streaming,                 32276
    -github.tar,                         level 19 with dict,                 advanced streaming,                 32712
    -github.tar,                         level 19 with dict dms,             advanced streaming,                 32555
    -github.tar,                         level 19 with dict dds,             advanced streaming,                 32555
    -github.tar,                         level 19 with dict copy,            advanced streaming,                 32712
    -github.tar,                         level 19 with dict load,            advanced streaming,                 32479
    +github.tar,                         level 19,                           advanced streaming,                 32262
    +github.tar,                         level 19 with dict,                 advanced streaming,                 32701
    +github.tar,                         level 19 with dict dms,             advanced streaming,                 32565
    +github.tar,                         level 19 with dict dds,             advanced streaming,                 32565
    +github.tar,                         level 19 with dict copy,            advanced streaming,                 32701
    +github.tar,                         level 19 with dict load,            advanced streaming,                 32428
     github.tar,                         no source size,                     advanced streaming,                 38828
     github.tar,                         no source size with dict,           advanced streaming,                 38000
     github.tar,                         long distance mode,                 advanced streaming,                 40252
    @@ -1179,7 +1179,7 @@ github.tar,                         small hash log,                     advanced
     github.tar,                         small chain log,                    advanced streaming,                 41669
     github.tar,                         explicit params,                    advanced streaming,                 41385
     github.tar,                         uncompressed literals,              advanced streaming,                 41525
    -github.tar,                         uncompressed literals optimal,      advanced streaming,                 35397
    +github.tar,                         uncompressed literals optimal,      advanced streaming,                 35356
     github.tar,                         huffman literals,                   advanced streaming,                 38853
     github.tar,                         multithreaded with advanced params, advanced streaming,                 41525
     silesia,                            level -5,                           old streaming,                      6854744
    @@ -1194,11 +1194,11 @@ silesia,                            level 6,                            old stre
     silesia,                            level 7,                            old streaming,                      4570271
     silesia,                            level 9,                            old streaming,                      4545850
     silesia,                            level 13,                           old streaming,                      4493990
    -silesia,                            level 16,                           old streaming,                      4360041
    -silesia,                            level 19,                           old streaming,                      4296055
    +silesia,                            level 16,                           old streaming,                      4359652
    +silesia,                            level 19,                           old streaming,                      4266582
     silesia,                            no source size,                     old streaming,                      4842039
     silesia,                            uncompressed literals,              old streaming,                      4842075
    -silesia,                            uncompressed literals optimal,      old streaming,                      4296055
    +silesia,                            uncompressed literals optimal,      old streaming,                      4266582
     silesia,                            huffman literals,                   old streaming,                      6172207
     silesia.tar,                        level -5,                           old streaming,                      6856523
     silesia.tar,                        level -3,                           old streaming,                      6505954
    @@ -1212,11 +1212,11 @@ silesia.tar,                        level 6,                            old stre
     silesia.tar,                        level 7,                            old streaming,                      4579823
     silesia.tar,                        level 9,                            old streaming,                      4555445
     silesia.tar,                        level 13,                           old streaming,                      4502956
    -silesia.tar,                        level 16,                           old streaming,                      4360546
    -silesia.tar,                        level 19,                           old streaming,                      4265911
    +silesia.tar,                        level 16,                           old streaming,                      4360385
    +silesia.tar,                        level 19,                           old streaming,                      4260939
     silesia.tar,                        no source size,                     old streaming,                      4859267
     silesia.tar,                        uncompressed literals,              old streaming,                      4859271
    -silesia.tar,                        uncompressed literals optimal,      old streaming,                      4265911
    +silesia.tar,                        uncompressed literals optimal,      old streaming,                      4260939
     silesia.tar,                        huffman literals,                   old streaming,                      6179056
     github,                             level -5,                           old streaming,                      204407
     github,                             level -5 with dict,                 old streaming,                      45832
    @@ -1243,9 +1243,9 @@ github,                             level 9 with dict,                  old stre
     github,                             level 13,                           old streaming,                      132878
     github,                             level 13 with dict,                 old streaming,                      39900
     github,                             level 16,                           old streaming,                      133209
    -github,                             level 16 with dict,                 old streaming,                      37577
    +github,                             level 16 with dict,                 old streaming,                      37902
     github,                             level 19,                           old streaming,                      132879
    -github,                             level 19 with dict,                 old streaming,                      37576
    +github,                             level 19 with dict,                 old streaming,                      37916
     github,                             no source size,                     old streaming,                      140599
     github,                             no source size with dict,           old streaming,                      40654
     github,                             uncompressed literals,              old streaming,                      136332
    @@ -1276,13 +1276,13 @@ github.tar,                         level 9 with dict,                  old stre
     github.tar,                         level 13,                           old streaming,                      35501
     github.tar,                         level 13 with dict,                 old streaming,                      37130
     github.tar,                         level 16,                           old streaming,                      40466
    -github.tar,                         level 16 with dict,                 old streaming,                      33374
    -github.tar,                         level 19,                           old streaming,                      32276
    -github.tar,                         level 19 with dict,                 old streaming,                      32712
    +github.tar,                         level 16 with dict,                 old streaming,                      33375
    +github.tar,                         level 19,                           old streaming,                      32262
    +github.tar,                         level 19 with dict,                 old streaming,                      32701
     github.tar,                         no source size,                     old streaming,                      38828
     github.tar,                         no source size with dict,           old streaming,                      38000
     github.tar,                         uncompressed literals,              old streaming,                      38831
    -github.tar,                         uncompressed literals optimal,      old streaming,                      32276
    +github.tar,                         uncompressed literals optimal,      old streaming,                      32262
     github.tar,                         huffman literals,                   old streaming,                      42560
     silesia,                            level -5,                           old streaming advanced,             6854744
     silesia,                            level -3,                           old streaming advanced,             6503319
    @@ -1296,8 +1296,8 @@ silesia,                            level 6,                            old stre
     silesia,                            level 7,                            old streaming advanced,             4570271
     silesia,                            level 9,                            old streaming advanced,             4545850
     silesia,                            level 13,                           old streaming advanced,             4493990
    -silesia,                            level 16,                           old streaming advanced,             4360041
    -silesia,                            level 19,                           old streaming advanced,             4296055
    +silesia,                            level 16,                           old streaming advanced,             4359652
    +silesia,                            level 19,                           old streaming advanced,             4266582
     silesia,                            no source size,                     old streaming advanced,             4842039
     silesia,                            long distance mode,                 old streaming advanced,             4842075
     silesia,                            multithreaded,                      old streaming advanced,             4842075
    @@ -1307,7 +1307,7 @@ silesia,                            small hash log,                     old stre
     silesia,                            small chain log,                    old streaming advanced,             4912197
     silesia,                            explicit params,                    old streaming advanced,             4795857
     silesia,                            uncompressed literals,              old streaming advanced,             4842075
    -silesia,                            uncompressed literals optimal,      old streaming advanced,             4296055
    +silesia,                            uncompressed literals optimal,      old streaming advanced,             4266582
     silesia,                            huffman literals,                   old streaming advanced,             6172207
     silesia,                            multithreaded with advanced params, old streaming advanced,             4842075
     silesia.tar,                        level -5,                           old streaming advanced,             6856523
    @@ -1322,8 +1322,8 @@ silesia.tar,                        level 6,                            old stre
     silesia.tar,                        level 7,                            old streaming advanced,             4579823
     silesia.tar,                        level 9,                            old streaming advanced,             4555445
     silesia.tar,                        level 13,                           old streaming advanced,             4502956
    -silesia.tar,                        level 16,                           old streaming advanced,             4360546
    -silesia.tar,                        level 19,                           old streaming advanced,             4265911
    +silesia.tar,                        level 16,                           old streaming advanced,             4360385
    +silesia.tar,                        level 19,                           old streaming advanced,             4260939
     silesia.tar,                        no source size,                     old streaming advanced,             4859267
     silesia.tar,                        long distance mode,                 old streaming advanced,             4859271
     silesia.tar,                        multithreaded,                      old streaming advanced,             4859271
    @@ -1333,7 +1333,7 @@ silesia.tar,                        small hash log,                     old stre
     silesia.tar,                        small chain log,                    old streaming advanced,             4917021
     silesia.tar,                        explicit params,                    old streaming advanced,             4807288
     silesia.tar,                        uncompressed literals,              old streaming advanced,             4859271
    -silesia.tar,                        uncompressed literals optimal,      old streaming advanced,             4265911
    +silesia.tar,                        uncompressed literals optimal,      old streaming advanced,             4260939
     silesia.tar,                        huffman literals,                   old streaming advanced,             6179056
     silesia.tar,                        multithreaded with advanced params, old streaming advanced,             4859271
     github,                             level -5,                           old streaming advanced,             213265
    @@ -1361,9 +1361,9 @@ github,                             level 9 with dict,                  old stre
     github,                             level 13,                           old streaming advanced,             138676
     github,                             level 13 with dict,                 old streaming advanced,             39725
     github,                             level 16,                           old streaming advanced,             138575
    -github,                             level 16 with dict,                 old streaming advanced,             40789
    +github,                             level 16 with dict,                 old streaming advanced,             40804
     github,                             level 19,                           old streaming advanced,             132879
    -github,                             level 19 with dict,                 old streaming advanced,             37576
    +github,                             level 19 with dict,                 old streaming advanced,             37916
     github,                             no source size,                     old streaming advanced,             140599
     github,                             no source size with dict,           old streaming advanced,             40608
     github,                             long distance mode,                 old streaming advanced,             141104
    @@ -1403,8 +1403,8 @@ github.tar,                         level 13,                           old stre
     github.tar,                         level 13 with dict,                 old streaming advanced,             35807
     github.tar,                         level 16,                           old streaming advanced,             40466
     github.tar,                         level 16 with dict,                 old streaming advanced,             38578
    -github.tar,                         level 19,                           old streaming advanced,             32276
    -github.tar,                         level 19 with dict,                 old streaming advanced,             32704
    +github.tar,                         level 19,                           old streaming advanced,             32262
    +github.tar,                         level 19 with dict,                 old streaming advanced,             32678
     github.tar,                         no source size,                     old streaming advanced,             38828
     github.tar,                         no source size with dict,           old streaming advanced,             38015
     github.tar,                         long distance mode,                 old streaming advanced,             38831
    @@ -1415,7 +1415,7 @@ github.tar,                         small hash log,                     old stre
     github.tar,                         small chain log,                    old streaming advanced,             41669
     github.tar,                         explicit params,                    old streaming advanced,             41385
     github.tar,                         uncompressed literals,              old streaming advanced,             38831
    -github.tar,                         uncompressed literals optimal,      old streaming advanced,             32276
    +github.tar,                         uncompressed literals optimal,      old streaming advanced,             32262
     github.tar,                         huffman literals,                   old streaming advanced,             42560
     github.tar,                         multithreaded with advanced params, old streaming advanced,             38831
     github,                             level -5 with dict,                 old streaming cdict,                45832
    @@ -1430,8 +1430,8 @@ github,                             level 6 with dict,                  old stre
     github,                             level 7 with dict,                  old streaming cdict,                38765
     github,                             level 9 with dict,                  old streaming cdict,                39439
     github,                             level 13 with dict,                 old streaming cdict,                39900
    -github,                             level 16 with dict,                 old streaming cdict,                37577
    -github,                             level 19 with dict,                 old streaming cdict,                37576
    +github,                             level 16 with dict,                 old streaming cdict,                37902
    +github,                             level 19 with dict,                 old streaming cdict,                37916
     github,                             no source size with dict,           old streaming cdict,                40654
     github.tar,                         level -5 with dict,                 old streaming cdict,                51286
     github.tar,                         level -3 with dict,                 old streaming cdict,                45147
    @@ -1446,7 +1446,7 @@ github.tar,                         level 7 with dict,                  old stre
     github.tar,                         level 9 with dict,                  old streaming cdict,                36322
     github.tar,                         level 13 with dict,                 old streaming cdict,                36010
     github.tar,                         level 16 with dict,                 old streaming cdict,                39081
    -github.tar,                         level 19 with dict,                 old streaming cdict,                32479
    +github.tar,                         level 19 with dict,                 old streaming cdict,                32428
     github.tar,                         no source size with dict,           old streaming cdict,                38000
     github,                             level -5 with dict,                 old streaming advanced cdict,       46708
     github,                             level -3 with dict,                 old streaming advanced cdict,       45476
    @@ -1460,8 +1460,8 @@ github,                             level 6 with dict,                  old stre
     github,                             level 7 with dict,                  old streaming advanced cdict,       38875
     github,                             level 9 with dict,                  old streaming advanced cdict,       38941
     github,                             level 13 with dict,                 old streaming advanced cdict,       39725
    -github,                             level 16 with dict,                 old streaming advanced cdict,       40789
    -github,                             level 19 with dict,                 old streaming advanced cdict,       37576
    +github,                             level 16 with dict,                 old streaming advanced cdict,       40804
    +github,                             level 19 with dict,                 old streaming advanced cdict,       37916
     github,                             no source size with dict,           old streaming advanced cdict,       40608
     github.tar,                         level -5 with dict,                 old streaming advanced cdict,       50791
     github.tar,                         level -3 with dict,                 old streaming advanced cdict,       44926
    @@ -1476,5 +1476,5 @@ github.tar,                         level 7 with dict,                  old stre
     github.tar,                         level 9 with dict,                  old streaming advanced cdict,       36241
     github.tar,                         level 13 with dict,                 old streaming advanced cdict,       35807
     github.tar,                         level 16 with dict,                 old streaming advanced cdict,       38578
    -github.tar,                         level 19 with dict,                 old streaming advanced cdict,       32704
    +github.tar,                         level 19 with dict,                 old streaming advanced cdict,       32678
     github.tar,                         no source size with dict,           old streaming advanced cdict,       38015
    diff --git a/third-party/zstd/tests/zstreamtest.c b/third-party/zstd/tests/zstreamtest.c
    index 14c4af82..e0ee4c3e 100644
    --- a/third-party/zstd/tests/zstreamtest.c
    +++ b/third-party/zstd/tests/zstreamtest.c
    @@ -408,8 +408,8 @@ static int basicUnitTests(U32 seed, double compressibility, int bigTests)
         if (inBuff.pos != inBuff.size) goto _output_error;   /* should have read the entire frame */
         DISPLAYLEVEL(3, "OK \n");
     
    -    /* Re-use without init */
    -    DISPLAYLEVEL(3, "test%3i : decompress again without init (re-use previous settings): ", testNb++);
    +    /* Reuse without init */
    +    DISPLAYLEVEL(3, "test%3i : decompress again without init (reuse previous settings): ", testNb++);
         outBuff.pos = 0;
         { size_t const remaining = ZSTD_decompressStream(zd, &outBuff, &inBuff2);
           if (remaining != 0) goto _output_error; }  /* should reach end of frame == 0; otherwise, some data left, or an error */
    @@ -653,8 +653,8 @@ static int basicUnitTests(U32 seed, double compressibility, int bigTests)
                 DISPLAYLEVEL(3, "OK (error detected : %s) \n", ZSTD_getErrorName(r));
         }   }
     
    -    /* Compression state re-use scenario */
    -    DISPLAYLEVEL(3, "test%3i : context re-use : ", testNb++);
    +    /* Compression state reuse scenario */
    +    DISPLAYLEVEL(3, "test%3i : context reuse : ", testNb++);
         ZSTD_freeCStream(zc);
         zc = ZSTD_createCStream();
         if (zc==NULL) goto _output_error;   /* memory allocation issue */
    @@ -722,6 +722,67 @@ static int basicUnitTests(U32 seed, double compressibility, int bigTests)
         }
         DISPLAYLEVEL(3, "OK \n");
     
    +    DISPLAYLEVEL(3, "test%3i : maxBlockSize = 2KB : ", testNb++);
    +    {
    +        ZSTD_DCtx* dctx = ZSTD_createDCtx();
    +        size_t singlePassSize, streamingSize, streaming2KSize;
    +
    +        {
    +            ZSTD_CCtx* cctx = ZSTD_createCCtx();
    +            CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1));
    +            CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, 18));
    +            CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0));
    +            CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 2048));
    +            cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBufferSize);
    +            CHECK_Z(cSize);
    +            ZSTD_freeCCtx(cctx);
    +        }
    +
    +        CHECK_Z(ZSTD_decompressDCtx(dctx, decodedBuffer, CNBufferSize, compressedBuffer, cSize));
    +        singlePassSize = ZSTD_sizeof_DCtx(dctx);
    +        CHECK_Z(singlePassSize);
    +
    +        inBuff.src = compressedBuffer;
    +        inBuff.size = cSize;
    +
    +        outBuff.dst = decodedBuffer;
    +        outBuff.size = decodedBufferSize;
    +
    +        CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, 2048));
    +        inBuff.pos = 0;
    +        outBuff.pos = 0;
    +        {
    +            size_t const r = ZSTD_decompressStream(dctx, &outBuff, &inBuff);
    +            CHECK_Z(r);
    +            CHECK(r != 0, "Entire frame must be decompressed");
    +        }
    +        streaming2KSize = ZSTD_sizeof_DCtx(dctx);
    +        CHECK_Z(streaming2KSize);
     +
    +        CHECK_Z(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters));
    +        inBuff.pos = 0;
    +        outBuff.pos = 0;
    +        {
    +            size_t const r = ZSTD_decompressStream(dctx, &outBuff, &inBuff);
    +            CHECK_Z(r);
    +            CHECK(r != 0, "Entire frame must be decompressed");
    +        }
    +        streamingSize = ZSTD_sizeof_DCtx(dctx);
    +        CHECK_Z(streamingSize);
    +        
    +        CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, 1024));
    +        inBuff.pos = 0;
    +        outBuff.pos = 0;
    +        CHECK(!ZSTD_isError(ZSTD_decompressStream(dctx, &outBuff, &inBuff)), "decompression must fail");
    +
    +        CHECK(streamingSize < singlePassSize + (1 << 18) + 3 * ZSTD_BLOCKSIZE_MAX, "Streaming doesn't use the right amount of memory");
    +        CHECK(streamingSize != streaming2KSize + 3 * (ZSTD_BLOCKSIZE_MAX - 2048), "ZSTD_d_blockSizeMax didn't save the right amount of memory");
    +        DISPLAYLEVEL(3, "| %zu | %zu | %zu | ", singlePassSize, streaming2KSize, streamingSize);
    +
    +        ZSTD_freeDCtx(dctx);
    +    }
    +    DISPLAYLEVEL(3, "OK \n");
    +
         /* Decompression with ZSTD_d_stableOutBuffer */
         cSize = ZSTD_compress(compressedBuffer, compressedBufferSize, CNBuffer, CNBufferSize, 1);
         CHECK_Z(cSize);
    @@ -1859,7 +1920,7 @@ static int basicUnitTests(U32 seed, double compressibility, int bigTests)
         DISPLAYLEVEL(3, "test%3i : Block-Level External Sequence Producer API: ", testNb++);
         {
             size_t const dstBufSize = ZSTD_compressBound(CNBufferSize);
    -        BYTE* const dstBuf = (BYTE*)malloc(ZSTD_compressBound(dstBufSize));
    +        BYTE* const dstBuf = (BYTE*)malloc(dstBufSize);
             size_t const checkBufSize = CNBufferSize;
             BYTE* const checkBuf = (BYTE*)malloc(checkBufSize);
             int enableFallback;
    @@ -2295,6 +2356,102 @@ static int basicUnitTests(U32 seed, double compressibility, int bigTests)
         }
         DISPLAYLEVEL(3, "OK \n");
     
    +    DISPLAYLEVEL(3, "test%3i : Testing external sequence producer with static CCtx: ", testNb++);
    +    {
    +        size_t const dstBufSize = ZSTD_compressBound(CNBufferSize);
    +        BYTE* const dstBuf = (BYTE*)malloc(dstBufSize);
    +        size_t const checkBufSize = CNBufferSize;
    +        BYTE* const checkBuf = (BYTE*)malloc(checkBufSize);
    +        ZSTD_CCtx_params* params = ZSTD_createCCtxParams();
    +        ZSTD_CCtx* staticCCtx;
    +        void* cctxBuf;
    +        EMF_testCase seqProdState;
    +
    +        CHECK_Z(ZSTD_CCtxParams_setParameter(params, ZSTD_c_validateSequences, 1));
    +        CHECK_Z(ZSTD_CCtxParams_setParameter(params, ZSTD_c_enableSeqProducerFallback, 0));
    +        ZSTD_CCtxParams_registerSequenceProducer(params, &seqProdState, zstreamSequenceProducer);
    +
    +        {
    +            size_t const cctxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params);
    +            cctxBuf = malloc(cctxSize);
    +            staticCCtx = ZSTD_initStaticCCtx(cctxBuf, cctxSize);
    +            ZSTD_CCtx_setParametersUsingCCtxParams(staticCCtx, params);
    +        }
    +
    +        // Check that compression with external sequence producer succeeds when expected
    +        seqProdState = EMF_LOTS_OF_SEQS;
    +        {
    +            size_t dResult;
    +            size_t const cResult = ZSTD_compress2(staticCCtx, dstBuf, dstBufSize, CNBuffer, CNBufferSize);
    +            CHECK(ZSTD_isError(cResult), "EMF: Compression error: %s", ZSTD_getErrorName(cResult));
    +            dResult = ZSTD_decompress(checkBuf, checkBufSize, dstBuf, cResult);
    +            CHECK(ZSTD_isError(dResult), "EMF: Decompression error: %s", ZSTD_getErrorName(dResult));
    +            CHECK(dResult != CNBufferSize, "EMF: Corruption!");
    +            CHECK(memcmp(CNBuffer, checkBuf, CNBufferSize) != 0, "EMF: Corruption!");
    +        }
    +
    +        // Check that compression with external sequence producer fails when expected
    +        seqProdState = EMF_BIG_ERROR;
    +        {
    +            size_t const cResult = ZSTD_compress2(staticCCtx, dstBuf, dstBufSize, CNBuffer, CNBufferSize);
    +            CHECK(!ZSTD_isError(cResult), "EMF: Should have raised an error!");
    +            CHECK(
    +                ZSTD_getErrorCode(cResult) != ZSTD_error_sequenceProducer_failed,
    +                "EMF: Wrong error code: %s", ZSTD_getErrorName(cResult)
    +            );
    +        }
    +
    +        free(dstBuf);
    +        free(checkBuf);
    +        free(cctxBuf);
    +        ZSTD_freeCCtxParams(params);
    +    }
    +    DISPLAYLEVEL(3, "OK \n");
    +
    +    DISPLAYLEVEL(3, "test%3i : Decoder should reject invalid frame header on legacy frames: ", testNb++);
    +    {
    +        const unsigned char compressed[] = { 0x26,0xb5,0x2f,0xfd,0x50,0x91,0xfd,0xd8,0xb5 };
    +        const size_t compressedSize = 9;
    +        size_t const dSize = ZSTD_decompress(NULL, 0, compressed, compressedSize);
    +        CHECK(!ZSTD_isError(dSize), "must reject when legacy frame header is invalid");
    +    }
    +    DISPLAYLEVEL(3, "OK \n");
    +
    +    DISPLAYLEVEL(3, "test%3i : Test single-shot fallback for magicless mode: ", testNb++);
    +    {
    +        // Aquire resources
    +        size_t const srcSize = COMPRESSIBLE_NOISE_LENGTH;
    +        void* src = malloc(srcSize);
    +        size_t const dstSize = ZSTD_compressBound(srcSize);
    +        void* dst = malloc(dstSize);
    +        size_t const valSize = srcSize;
    +        void* val = malloc(valSize);
    +        ZSTD_inBuffer inBuf = { dst, dstSize, 0 };
    +        ZSTD_outBuffer outBuf = { val, valSize, 0 };
    +        ZSTD_CCtx* cctx = ZSTD_createCCtx();
    +        ZSTD_DCtx* dctx = ZSTD_createDCtx();
    +        CHECK(!src || !dst || !val || !dctx || !cctx, "memory allocation failure");
    +
    +        // Write test data for decompression to dst
    +        RDG_genBuffer(src, srcSize, compressibility, 0.0, 0xdeadbeef);
    +        CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless));
    +        CHECK_Z(ZSTD_compress2(cctx, dst, dstSize, src, srcSize));
    +
    +        // Run decompression
    +        CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless));
    +        CHECK_Z(ZSTD_decompressStream(dctx, &outBuf, &inBuf));
    +
    +        // Validate
    +        CHECK(outBuf.pos != srcSize, "decompressed size must match");
    +        CHECK(memcmp(src, val, srcSize) != 0, "decompressed data must match");
    +        
    +        // Cleanup
    +        free(src); free(dst); free(val);
    +        ZSTD_freeCCtx(cctx);
    +        ZSTD_freeDCtx(dctx);
    +    }
    +    DISPLAYLEVEL(3, "OK \n");
    +
     _end:
         FUZ_freeDictionary(dictionary);
         ZSTD_freeCStream(zc);
    @@ -2845,6 +3002,13 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest,
                     if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_forceMaxWindow, FUZ_rand(&lseed) & 1, opaqueAPI) );
                     if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_deterministicRefPrefix, FUZ_rand(&lseed) & 1, opaqueAPI) );
     
    +                /* Set max block size parameters */
    +                if (FUZ_rand(&lseed) & 1) {
    +                    int maxBlockSize = (int)(FUZ_rand(&lseed) % ZSTD_BLOCKSIZE_MAX);
    +                    maxBlockSize = MAX(1024, maxBlockSize);
    +                    CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_maxBlockSize, maxBlockSize, opaqueAPI) );
    +                }
    +
                     /* Apply parameters */
                     if (opaqueAPI) {
                         DISPLAYLEVEL(5, "t%u: applying CCtxParams \n", testNb);
    @@ -2976,6 +3140,13 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest,
             if (FUZ_rand(&lseed) & 1) {
                 CHECK_Z(ZSTD_DCtx_setParameter(zd, ZSTD_d_disableHuffmanAssembly, FUZ_rand(&lseed) & 1));
             }
    +        if (FUZ_rand(&lseed) & 1) {
    +            int maxBlockSize;
    +            CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_c_maxBlockSize, &maxBlockSize));
    +            CHECK_Z(ZSTD_DCtx_setParameter(zd, ZSTD_d_maxBlockSize, maxBlockSize));
    +        } else {
    +            CHECK_Z(ZSTD_DCtx_setParameter(zd, ZSTD_d_maxBlockSize, 0));
    +        }
             {   size_t decompressionResult = 1;
                 ZSTD_inBuffer  inBuff = { cBuffer, cSize, 0 };
                 ZSTD_outBuffer outBuff= { dstBuffer, dstBufferSize, 0 };
    diff --git a/third-party/zstd/zlibWrapper/examples/example.c b/third-party/zstd/zlibWrapper/examples/example.c
    index d7590e31..99fbf5b1 100644
    --- a/third-party/zstd/zlibWrapper/examples/example.c
    +++ b/third-party/zstd/zlibWrapper/examples/example.c
    @@ -77,9 +77,7 @@ int  main               _Z_OF((int argc, char *argv[]));
     void *myalloc _Z_OF((void *, unsigned, unsigned));
     void myfree _Z_OF((void *, void *));
     
    -void *myalloc(q, n, m)
    -    void *q;
    -    unsigned n, m;
    +void *myalloc(void *q, unsigned n, unsigned m)
     {
         void *buf = calloc(n, m);
         q = Z_NULL;
    @@ -110,10 +108,8 @@ void test_gzio          _Z_OF((const char *fname,
     /* ===========================================================================
      * Test compress() and uncompress()
      */
    -void test_compress(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    -{
    +void test_compress(Byte *compr, uLong comprLen, Byte *uncompr,
    +                   uLong uncomprLen) {
         int err;
         uLong len = (uLong)strlen(hello)+1;
     
    @@ -136,11 +132,7 @@ void test_compress(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test read/write of .gz files
      */
    -void test_gzio(fname, uncompr, uncomprLen)
    -    const char *fname; /* compressed file name */
    -    Byte *uncompr;
    -    uLong uncomprLen;
    -{
    +void test_gzio(const char *fname, Byte *uncompr, uLong uncomprLen) {
     #ifdef NO_GZCOMPRESS
         fprintf(stderr, "NO_GZCOMPRESS -- gz* functions cannot compress\n");
     #else
    @@ -222,10 +214,7 @@ void test_gzio(fname, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with small buffers
      */
    -void test_deflate(compr, comprLen)
    -    Byte *compr;
    -    uLong comprLen;
    -{
    +void test_deflate(Byte *compr, uLong comprLen) {
         z_stream c_stream; /* compression stream */
         int err;
         uLong len = (uLong)strlen(hello)+1;
    @@ -260,10 +249,8 @@ void test_deflate(compr, comprLen)
     /* ===========================================================================
      * Test inflate() with small buffers
      */
    -void test_inflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    -{
    +void test_inflate(Byte *compr, uLong comprLen, Byte *uncompr,
    +                  uLong uncomprLen) {
         int err;
         z_stream d_stream; /* decompression stream */
     
    @@ -301,10 +288,8 @@ void test_inflate(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with large buffers and dynamic change of compression level
      */
    -void test_large_deflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    -{
    +void test_large_deflate(Byte *compr, uLong comprLen, Byte *uncompr,
    +                        uLong uncomprLen) {
         z_stream c_stream; /* compression stream */
         int err;
     
    @@ -355,11 +340,9 @@ void test_large_deflate(compr, comprLen, uncompr, uncomprLen)
     
     /* ===========================================================================
      * Test inflate() with large buffers
    - */
    -void test_large_inflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    -{
    + */ 
    +void test_large_inflate(Byte *compr, uLong comprLen, Byte *uncompr,
    +                        uLong uncomprLen) {
         int err;
         z_stream d_stream; /* decompression stream */
     
    @@ -397,10 +380,7 @@ void test_large_inflate(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with full flush
      */
    -void test_flush(compr, comprLen)
    -    Byte *compr;
    -    uLong *comprLen;
    -{
    +void test_flush(Byte *compr, uLong *comprLen) {
         z_stream c_stream; /* compression stream */
         int err;
         uInt len = (uInt)strlen(hello)+1;
    @@ -435,10 +415,7 @@ void test_flush(compr, comprLen)
     /* ===========================================================================
      * Test inflateSync()
      */
    -void test_sync(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    -{
    +void test_sync(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen) {
         int err;
         z_stream d_stream; /* decompression stream */
     
    @@ -479,10 +456,7 @@ void test_sync(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with preset dictionary
      */
    -void test_dict_deflate(compr, comprLen)
    -    Byte *compr;
    -    uLong comprLen;
    -{
    +void test_dict_deflate(Byte *compr, uLong comprLen) {
         z_stream c_stream; /* compression stream */
         int err;
     
    @@ -516,10 +490,8 @@ void test_dict_deflate(compr, comprLen)
     /* ===========================================================================
      * Test inflate() with a preset dictionary
      */
    -void test_dict_inflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    -{
    +void test_dict_inflate(Byte *compr, uLong comprLen, Byte *uncompr,
    +                       uLong uncomprLen) {
         int err;
         z_stream d_stream; /* decompression stream */
     
    @@ -567,10 +539,7 @@ void test_dict_inflate(compr, comprLen, uncompr, uncomprLen)
      * Usage:  example [output.gz  [input.gz]]
      */
     
    -int main(argc, argv)
    -    int argc;
    -    char *argv[];
    -{
    +int main(int argc, char *argv[]) {
         Byte *compr, *uncompr;
         uLong comprLen = 10000*sizeof(int); /* don't overflow on MSDOS */
         uLong uncomprLen = comprLen;
    diff --git a/third-party/zstd/zlibWrapper/examples/example_original.c b/third-party/zstd/zlibWrapper/examples/example_original.c
    index 5b4e4d1d..828b06c8 100644
    --- a/third-party/zstd/zlibWrapper/examples/example_original.c
    +++ b/third-party/zstd/zlibWrapper/examples/example_original.c
    @@ -102,9 +102,7 @@ void test_gzio          _Z_OF((const char *fname,
     /* ===========================================================================
      * Test compress() and uncompress()
      */
    -void test_compress(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    +void test_compress(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen)
     {
         int err;
         uLong len = (uLong)strlen(hello)+1;
    @@ -128,10 +126,8 @@ void test_compress(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test read/write of .gz files
      */
    -void test_gzio(fname, uncompr, uncomprLen)
    -    const char *fname; /* compressed file name */
    -    Byte *uncompr;
    -    uLong uncomprLen;
    +void test_gzio(const char *fname /* compressed file name */, Byte *uncompr,
    +    uLong uncomprLen)
     {
     #ifdef NO_GZCOMPRESS
         fprintf(stderr, "NO_GZCOMPRESS -- gz* functions cannot compress\n");
    @@ -214,9 +210,7 @@ void test_gzio(fname, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with small buffers
      */
    -void test_deflate(compr, comprLen)
    -    Byte *compr;
    -    uLong comprLen;
    +void test_deflate(Byte *compr, uLong comprLen)
     {
         z_stream c_stream; /* compression stream */
         int err;
    @@ -252,9 +246,7 @@ void test_deflate(compr, comprLen)
     /* ===========================================================================
      * Test inflate() with small buffers
      */
    -void test_inflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    +void test_inflate(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen)
     {
         int err;
         z_stream d_stream; /* decompression stream */
    @@ -293,9 +285,8 @@ void test_inflate(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with large buffers and dynamic change of compression level
      */
    -void test_large_deflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    +void test_large_deflate(Byte *compr, uLong comprLen, Byte *uncompr,
    +    uLong uncomprLen)
     {
         z_stream c_stream; /* compression stream */
         int err;
    @@ -348,9 +339,8 @@ void test_large_deflate(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test inflate() with large buffers
      */
    -void test_large_inflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    +void test_large_inflate(Byte *compr, uLong comprLen, Byte *uncompr,
    +    uLong uncomprLen)
     {
         int err;
         z_stream d_stream; /* decompression stream */
    @@ -389,9 +379,7 @@ void test_large_inflate(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with full flush
      */
    -void test_flush(compr, comprLen)
    -    Byte *compr;
    -    uLong *comprLen;
    +void test_flush(Byte *compr, uLong comprLen)
     {
         z_stream c_stream; /* compression stream */
         int err;
    @@ -427,9 +415,7 @@ void test_flush(compr, comprLen)
     /* ===========================================================================
      * Test inflateSync()
      */
    -void test_sync(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    +void test_sync(Byte *compr, uLong comprLen, Byte *uncompr, uLong uncomprLen)
     {
         int err;
         z_stream d_stream; /* decompression stream */
    @@ -471,9 +457,7 @@ void test_sync(compr, comprLen, uncompr, uncomprLen)
     /* ===========================================================================
      * Test deflate() with preset dictionary
      */
    -void test_dict_deflate(compr, comprLen)
    -    Byte *compr;
    -    uLong comprLen;
    +void test_dict_deflate(Byte *compr, uLong comprLen)
     {
         z_stream c_stream; /* compression stream */
         int err;
    @@ -508,9 +492,8 @@ void test_dict_deflate(compr, comprLen)
     /* ===========================================================================
      * Test inflate() with a preset dictionary
      */
    -void test_dict_inflate(compr, comprLen, uncompr, uncomprLen)
    -    Byte *compr, *uncompr;
    -    uLong comprLen, uncomprLen;
    +void test_dict_inflate(Byte *compr, uLong comprLen, Byte *uncompr,
    +    uLong uncomprLen)
     {
         int err;
         z_stream d_stream; /* decompression stream */
    @@ -559,9 +542,7 @@ void test_dict_inflate(compr, comprLen, uncompr, uncomprLen)
      * Usage:  example [output.gz  [input.gz]]
      */
     
    -int main(argc, argv)
    -    int argc;
    -    char *argv[];
    +int main(int argc, char *argv[])
     {
         Byte *compr, *uncompr;
         uLong comprLen = 10000*sizeof(int); /* don't overflow on MSDOS */
    diff --git a/third-party/zstd/zlibWrapper/examples/minigzip.c b/third-party/zstd/zlibWrapper/examples/minigzip.c
    index 717a94df..1af81520 100644
    --- a/third-party/zstd/zlibWrapper/examples/minigzip.c
    +++ b/third-party/zstd/zlibWrapper/examples/minigzip.c
    @@ -34,7 +34,7 @@
     #  include 
     #endif
     
    -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
    +#if defined(MSDOS) || defined(OS2) || defined(_WIN32) || defined(__CYGWIN__)
     #  include 
     #  include 
     #  ifdef UNDER_CE
    @@ -63,7 +63,7 @@
     #endif
     
     #if !defined(Z_HAVE_UNISTD_H) && !defined(_LARGEFILE64_SOURCE)
    -#ifndef WIN32 /* unlink already in stdio.h for WIN32 */
    +#ifndef _WIN32 /* unlink already in stdio.h for WIN32 */
       extern int unlink _Z_OF((const char *));
     #endif
     #endif
    @@ -82,8 +82,7 @@
        The strwinerror function does not change the current setting
        of GetLastError.  */
     
    -static char *strwinerror (error)
    -     DWORD error;
    +static char *strwinerror(DWORD error)
     {
         static char buf[1024];
     
    @@ -121,8 +120,7 @@ static char *strwinerror (error)
         return buf;
     }
     
    -static void pwinerror (s)
    -    const char *s;
    +static void pwinerror (const char *s)
     {
         if (s && *s)
             fprintf(stderr, "%s: %s\n", s, strwinerror(GetLastError ()));
    @@ -198,11 +196,7 @@ const char *mode;
         return gz_open(NULL, fd, mode);
     }
     
    -gzFile gz_open(path, fd, mode)
    -    const char *path;
    -    int fd;
    -    const char *mode;
    -{
    +gzFile gz_open(const char *path, int fd, const char *mode) {
         gzFile gz;
         int ret;
     
    @@ -238,11 +232,7 @@ gzFile gz_open(path, fd, mode)
     
     int gzwrite _Z_OF((gzFile, const void *, unsigned));
     
    -int gzwrite(gz, buf, len)
    -    gzFile gz;
    -    const void *buf;
    -    unsigned len;
    -{
    +int gzwrite(gzFile gz, const void *buf, unsigned len) {
         z_stream *strm;
         unsigned char out[BUFLEN];
     
    @@ -262,11 +252,7 @@ int gzwrite(gz, buf, len)
     
     int gzread _Z_OF((gzFile, void *, unsigned));
     
    -int gzread(gz, buf, len)
    -    gzFile gz;
    -    void *buf;
    -    unsigned len;
    -{
    +int gzread(gzFile gz, void *buf, unsigned len) {
         int ret;
         unsigned got;
         unsigned char in[1];
    @@ -299,9 +285,7 @@ int gzread(gz, buf, len)
     
     int gzclose _Z_OF((gzFile));
     
    -int gzclose(gz)
    -    gzFile gz;
    -{
    +int gzclose(gzFile gz) {
         z_stream *strm;
         unsigned char out[BUFLEN];
     
    @@ -328,9 +312,7 @@ int gzclose(gz)
     
     const char *gzerror _Z_OF((gzFile, int *));
     
    -const char *gzerror(gz, err)
    -    gzFile gz;
    -    int *err;
    +const char *gzerror(gzFile gz, int *err)
     {
         *err = gz->err;
         return gz->msg;
    @@ -353,8 +335,7 @@ int  main             _Z_OF((int argc, char *argv[]));
     /* ===========================================================================
      * Display error message and exit
      */
    -void error(msg)
    -    const char *msg;
    +void error(const char *msg)
     {
         fprintf(stderr, "%s: %s\n", prog, msg);
         exit(1);
    @@ -364,9 +345,7 @@ void error(msg)
      * Compress input to output then close both files.
      */
     
    -void gz_compress(in, out)
    -    FILE   *in;
    -    gzFile out;
    +void gz_compress(FILE *in, gzFile out)
     {
         local char buf[BUFLEN];
         int len;
    @@ -397,10 +376,7 @@ void gz_compress(in, out)
     /* Try compressing the input file at once using mmap. Return Z_OK if
      * if success, Z_ERRNO otherwise.
      */
    -int gz_compress_mmap(in, out)
    -    FILE   *in;
    -    gzFile out;
    -{
    +int gz_compress_mmap(FILE *in, gzFile out) {
         int len;
         int err;
         int ifd = fileno(in);
    @@ -432,10 +408,7 @@ int gz_compress_mmap(in, out)
     /* ===========================================================================
      * Uncompress input to output then close both files.
      */
    -void gz_uncompress(in, out)
    -    gzFile in;
    -    FILE   *out;
    -{
    +void gz_uncompress(gzFile in, FILE *out) {
         local char buf[BUFLEN];
         int len;
         int err;
    @@ -459,10 +432,7 @@ void gz_uncompress(in, out)
      * Compress the given file: create a corresponding .gz file and remove the
      * original.
      */
    -void file_compress(file, mode)
    -    char  *file;
    -    char  *mode;
    -{
    +void file_compress(char *file, char *mode) {
         local char outfile[MAX_NAME_LEN];
         FILE  *in;
         gzFile out;
    @@ -494,9 +464,7 @@ void file_compress(file, mode)
     /* ===========================================================================
      * Uncompress the given file and remove the original.
      */
    -void file_uncompress(file)
    -    char  *file;
    -{
    +void file_uncompress(char *file) {
         local char buf[MAX_NAME_LEN];
         char *infile, *outfile;
         FILE  *out;
    @@ -546,10 +514,7 @@ void file_uncompress(file)
      *   -1 to -9 : compression level
      */
     
    -int main(argc, argv)
    -    int argc;
    -    char *argv[];
    -{
    +int main(int argc, char *argv[]) {
         int copyout = 0;
         int uncompr = 0;
         gzFile file;
    diff --git a/third-party/zstd/zlibWrapper/gzclose.c b/third-party/zstd/zlibWrapper/gzclose.c
    index ba43b8c5..12a2dfc5 100644
    --- a/third-party/zstd/zlibWrapper/gzclose.c
    +++ b/third-party/zstd/zlibWrapper/gzclose.c
    @@ -11,9 +11,7 @@
     /* gzclose() is in a separate file so that it is linked in only if it is used.
        That way the other gzclose functions can be used instead to avoid linking in
        unneeded compression or decompression routines. */
    -int ZEXPORT gzclose(file)
    -    gzFile file;
    -{
    +int ZEXPORT gzclose(gzFile file) {
     #ifndef NO_GZCOMPRESS
         gz_statep state;
     
    diff --git a/third-party/zstd/zlibWrapper/gzlib.c b/third-party/zstd/zlibWrapper/gzlib.c
    index eea480a7..c7265151 100644
    --- a/third-party/zstd/zlibWrapper/gzlib.c
    +++ b/third-party/zstd/zlibWrapper/gzlib.c
    @@ -33,9 +33,7 @@ local gzFile gz_open _Z_OF((const void *, int, const char *));
     
        The gz_strwinerror function does not change the current setting of
        GetLastError. */
    -char ZLIB_INTERNAL *gz_strwinerror (error)
    -     DWORD error;
    -{
    +char ZLIB_INTERNAL *gz_strwinerror(DWORD error) {
         static char buf[1024];
     
         wchar_t *msgbuf;
    @@ -75,9 +73,7 @@ char ZLIB_INTERNAL *gz_strwinerror (error)
     #endif /* UNDER_CE */
     
     /* Reset gzip file state */
    -local void gz_reset(state)
    -    gz_statep state;
    -{
    +local void gz_reset(gz_statep state) {
         state.state->x.have = 0;              /* no output data available */
         if (state.state->mode == GZ_READ) {   /* for reading ... */
             state.state->eof = 0;             /* not at end of file */
    @@ -91,11 +87,7 @@ local void gz_reset(state)
     }
     
     /* Open a gzip file either by name or file descriptor. */
    -local gzFile gz_open(path, fd, mode)
    -    const void *path;
    -    int fd;
    -    const char *mode;
    -{
    +local gzFile gz_open(const void *path, int fd, const char *mode) {
         gz_statep state;
         z_size_t len;
         int oflag;
    @@ -270,26 +262,17 @@ local gzFile gz_open(path, fd, mode)
     }
     
     /* -- see zlib.h -- */
    -gzFile ZEXPORT gzopen(path, mode)
    -    const char *path;
    -    const char *mode;
    -{
    +gzFile ZEXPORT gzopen(const char *path, const char *mode) {
         return gz_open(path, -1, mode);
     }
     
     /* -- see zlib.h -- */
    -gzFile ZEXPORT gzopen64(path, mode)
    -    const char *path;
    -    const char *mode;
    -{
    +gzFile ZEXPORT gzopen64(const char *path, const char *mode) {
         return gz_open(path, -1, mode);
     }
     
     /* -- see zlib.h -- */
    -gzFile ZEXPORT gzdopen(fd, mode)
    -    int fd;
    -    const char *mode;
    -{
    +gzFile ZEXPORT gzdopen(int fd, const char *mode) {
         char *path;         /* identifier for error messages */
         gzFile gz;
     
    @@ -307,19 +290,13 @@ gzFile ZEXPORT gzdopen(fd, mode)
     
     /* -- see zlib.h -- */
     #ifdef WIDECHAR
    -gzFile ZEXPORT gzopen_w(path, mode)
    -    const wchar_t *path;
    -    const char *mode;
    -{
    +gzFile ZEXPORT gzopen_w(const wchar_t *path, const char *mode) {
         return gz_open(path, -2, mode);
     }
     #endif
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzbuffer(file, size)
    -    gzFile file;
    -    unsigned size;
    -{
    +int ZEXPORT gzbuffer(gzFile file, unsigned size) {
         gz_statep state;
     
         /* get internal structure and check integrity */
    @@ -343,9 +320,7 @@ int ZEXPORT gzbuffer(file, size)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzrewind(file)
    -    gzFile file;
    -{
    +int ZEXPORT gzrewind(gzFile file) {
         gz_statep state;
     
         /* get internal structure */
    @@ -366,11 +341,7 @@ int ZEXPORT gzrewind(file)
     }
     
     /* -- see zlib.h -- */
    -z_off64_t ZEXPORT gzseek64(file, offset, whence)
    -    gzFile file;
    -    z_off64_t offset;
    -    int whence;
    -{
    +z_off64_t ZEXPORT gzseek64(gzFile file, z_off64_t offset, int whence) {
         unsigned n;
         z_off64_t ret;
         gz_statep state;
    @@ -443,11 +414,7 @@ z_off64_t ZEXPORT gzseek64(file, offset, whence)
     }
     
     /* -- see zlib.h -- */
    -z_off_t ZEXPORT gzseek(file, offset, whence)
    -    gzFile file;
    -    z_off_t offset;
    -    int whence;
    -{
    +z_off_t ZEXPORT gzseek(gzFile file, z_off_t offset, int whence) {
         z_off64_t ret;
     
         ret = gzseek64(file, (z_off64_t)offset, whence);
    @@ -455,9 +422,7 @@ z_off_t ZEXPORT gzseek(file, offset, whence)
     }
     
     /* -- see zlib.h -- */
    -z_off64_t ZEXPORT gztell64(file)
    -    gzFile file;
    -{
    +z_off64_t ZEXPORT gztell64(gzFile file) {
         gz_statep state;
     
         /* get internal structure and check integrity */
    @@ -472,9 +437,7 @@ z_off64_t ZEXPORT gztell64(file)
     }
     
     /* -- see zlib.h -- */
    -z_off_t ZEXPORT gztell(file)
    -    gzFile file;
    -{
    +z_off_t ZEXPORT gztell(gzFile file) {
         z_off64_t ret;
     
         ret = gztell64(file);
    @@ -482,9 +445,7 @@ z_off_t ZEXPORT gztell(file)
     }
     
     /* -- see zlib.h -- */
    -z_off64_t ZEXPORT gzoffset64(file)
    -    gzFile file;
    -{
    +z_off64_t ZEXPORT gzoffset64(gzFile file) {
         z_off64_t offset;
         gz_statep state;
     
    @@ -505,9 +466,7 @@ z_off64_t ZEXPORT gzoffset64(file)
     }
     
     /* -- see zlib.h -- */
    -z_off_t ZEXPORT gzoffset(file)
    -    gzFile file;
    -{
    +z_off_t ZEXPORT gzoffset(gzFile file) {
         z_off64_t ret;
     
         ret = gzoffset64(file);
    @@ -515,9 +474,7 @@ z_off_t ZEXPORT gzoffset(file)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzeof(file)
    -    gzFile file;
    -{
    +int ZEXPORT gzeof(gzFile file) {
         gz_statep state;
     
         /* get internal structure and check integrity */
    @@ -532,10 +489,7 @@ int ZEXPORT gzeof(file)
     }
     
     /* -- see zlib.h -- */
    -const char * ZEXPORT gzerror(file, errnum)
    -    gzFile file;
    -    int *errnum;
    -{
    +const char * ZEXPORT gzerror(gzFile file, int *errnum) {
         gz_statep state;
     
         /* get internal structure and check integrity */
    @@ -553,9 +507,7 @@ const char * ZEXPORT gzerror(file, errnum)
     }
     
     /* -- see zlib.h -- */
    -void ZEXPORT gzclearerr(file)
    -    gzFile file;
    -{
    +void ZEXPORT gzclearerr(gzFile file) {
         gz_statep state;
     
         /* get internal structure and check integrity */
    @@ -579,11 +531,7 @@ void ZEXPORT gzclearerr(file)
        memory).  Simply save the error message as a static string.  If there is an
        allocation failure constructing the error message, then convert the error to
        out of memory. */
    -void ZLIB_INTERNAL gz_error(state, err, msg)
    -    gz_statep state;
    -    int err;
    -    const char *msg;
    -{
    +void ZLIB_INTERNAL gz_error(gz_statep state, int err, const char *msg) {
         /* free previously allocated message and clear */
         if (state.state->msg != NULL) {
             if (state.state->err != Z_MEM_ERROR)
    @@ -625,8 +573,7 @@ void ZLIB_INTERNAL gz_error(state, err, msg)
        available) -- we need to do this to cover cases where 2's complement not
        used, since C standard permits 1's complement and sign-bit representations,
        otherwise we could just use ((unsigned)-1) >> 1 */
    -unsigned ZLIB_INTERNAL gz_intmax()
    -{
    +unsigned ZLIB_INTERNAL gz_intmax() {
         unsigned p, q;
     
         p = 1;
    diff --git a/third-party/zstd/zlibWrapper/gzread.c b/third-party/zstd/zlibWrapper/gzread.c
    index 584fad1e..ed3c1782 100644
    --- a/third-party/zstd/zlibWrapper/gzread.c
    +++ b/third-party/zstd/zlibWrapper/gzread.c
    @@ -29,12 +29,8 @@ local z_size_t gz_read _Z_OF((gz_statep, voidp, z_size_t));
        state.state->fd, and update state.state->eof, state.state->err, and state.state->msg as appropriate.
        This function needs to loop on read(), since read() is not guaranteed to
        read the number of bytes requested, depending on the type of descriptor. */
    -local int gz_load(state, buf, len, have)
    -    gz_statep state;
    -    unsigned char *buf;
    -    unsigned len;
    -    unsigned *have;
    -{
    +local int gz_load(gz_statep state, unsigned char *buf, unsigned len,
    +                  unsigned *have) {
         ssize_t ret;
         unsigned get, max = ((unsigned)-1 >> 2) + 1;
     
    @@ -64,8 +60,7 @@ local int gz_load(state, buf, len, have)
        If strm->avail_in != 0, then the current data is moved to the beginning of
        the input buffer, and then the remainder of the buffer is loaded with the
        available data from the input file. */
    -local int gz_avail(state)
    -    gz_statep state;
    +local int gz_avail(gz_statep state)
     {
         unsigned got;
         z_streamp strm = &(state.state->strm);
    @@ -99,9 +94,7 @@ local int gz_avail(state)
        case, all further file reads will be directly to either the output buffer or
        a user buffer.  If decompressing, the inflate state will be initialized.
        gz_look() will return 0 on success or -1 on failure. */
    -local int gz_look(state)
    -    gz_statep state;
    -{
    +local int gz_look(gz_statep state) {
         z_streamp strm = &(state.state->strm);
     
         /* allocate read buffers and inflate memory */
    @@ -184,9 +177,7 @@ local int gz_look(state)
        data.  If the gzip stream completes, state.state->how is reset to LOOK to look for
        the next gzip stream or raw data, once state.state->x.have is depleted.  Returns 0
        on success, -1 on failure. */
    -local int gz_decomp(state)
    -    gz_statep state;
    -{
    +local int gz_decomp(gz_statep state) {
         int ret = Z_OK;
         unsigned had;
         z_streamp strm = &(state.state->strm);
    @@ -238,9 +229,7 @@ local int gz_decomp(state)
        looked for to determine whether to copy or decompress.  Returns -1 on error,
        otherwise 0.  gz_fetch() will leave state.state->how as COPY or GZIP unless the
        end of the input file has been reached and all data has been processed.  */
    -local int gz_fetch(state)
    -    gz_statep state;
    -{
    +local int gz_fetch(gz_statep state) {
         z_streamp strm = &(state.state->strm);
     
         do {
    @@ -268,10 +257,7 @@ local int gz_fetch(state)
     }
     
     /* Skip len uncompressed bytes of output.  Return -1 on error, 0 on success. */
    -local int gz_skip(state, len)
    -    gz_statep state;
    -    z_off64_t len;
    -{
    +local int gz_skip(gz_statep state, z_off64_t len) {
         unsigned n;
     
         /* skip over len bytes or reach end-of-file, whichever comes first */
    @@ -303,11 +289,7 @@ local int gz_skip(state, len)
        input.  Return the number of bytes read.  If zero is returned, either the
        end of file was reached, or there was an error.  state.state->err must be
        consulted in that case to determine which. */
    -local z_size_t gz_read(state, buf, len)
    -    gz_statep state;
    -    voidp buf;
    -    z_size_t len;
    -{
    +local z_size_t gz_read(gz_statep state, voidp buf, z_size_t len) {
         z_size_t got;
         unsigned n;
     
    @@ -384,11 +366,7 @@ local z_size_t gz_read(state, buf, len)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzread(file, buf, len)
    -    gzFile file;
    -    voidp buf;
    -    unsigned len;
    -{
    +int ZEXPORT gzread(gzFile file, voidp buf, unsigned len) {
         gz_statep state;
     
         /* get internal structure */
    @@ -420,12 +398,8 @@ int ZEXPORT gzread(file, buf, len)
     }
     
     /* -- see zlib.h -- */
    -z_size_t ZEXPORT gzfread(buf, size, nitems, file)
    -    voidp buf;
    -    z_size_t size;
    -    z_size_t nitems;
    -    gzFile file;
    -{
    +z_size_t ZEXPORT gzfread(voidp buf, z_size_t size, z_size_t nitems,
    +                         gzFile file) {
         z_size_t len;
         gz_statep state;
     
    @@ -468,9 +442,7 @@ ZEXTERN int ZEXPORT gzgetc _Z_OF((gzFile file));
     ZEXTERN int ZEXPORT gzgetc_ _Z_OF((gzFile file));
     #endif
     
    -int ZEXPORT gzgetc(file)
    -    gzFile file;
    -{
    +int ZEXPORT gzgetc(gzFile file) {
         int ret;
         unsigned char buf[1];
         gz_statep state;
    @@ -497,17 +469,12 @@ int ZEXPORT gzgetc(file)
         return ret < 1 ? -1 : buf[0];
     }
     
    -int ZEXPORT gzgetc_(file)
    -gzFile file;
    -{
    +int ZEXPORT gzgetc_(gzFile file) {
         return gzgetc(file);
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzungetc(c, file)
    -    int c;
    -    gzFile file;
    -{
    +int ZEXPORT gzungetc(int c, gzFile file) {
         gz_statep state;
     
         /* get internal structure */
    @@ -564,11 +531,7 @@ int ZEXPORT gzungetc(c, file)
     }
     
     /* -- see zlib.h -- */
    -char * ZEXPORT gzgets(file, buf, len)
    -    gzFile file;
    -    char *buf;
    -    int len;
    -{
    +char * ZEXPORT gzgets(gzFile file, char *buf, int len) {
         unsigned left, n;
         char *str;
         unsigned char *eol;
    @@ -628,9 +591,7 @@ char * ZEXPORT gzgets(file, buf, len)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzdirect(file)
    -    gzFile file;
    -{
    +int ZEXPORT gzdirect(gzFile file) {
         gz_statep state;
     
         /* get internal structure */
    @@ -648,9 +609,7 @@ int ZEXPORT gzdirect(file)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzclose_r(file)
    -    gzFile file;
    -{
    +int ZEXPORT gzclose_r(gzFile file) {
         int ret, err;
         gz_statep state;
     
    diff --git a/third-party/zstd/zlibWrapper/gzwrite.c b/third-party/zstd/zlibWrapper/gzwrite.c
    index ccd4f71f..81da1531 100644
    --- a/third-party/zstd/zlibWrapper/gzwrite.c
    +++ b/third-party/zstd/zlibWrapper/gzwrite.c
    @@ -19,9 +19,7 @@ local z_size_t gz_write _Z_OF((gz_statep, voidpc, z_size_t));
     /* Initialize state for writing a gzip file.  Mark initialization by setting
        state.state->size to non-zero.  Return -1 on a memory allocation failure, or 0 on
        success. */
    -local int gz_init(state)
    -    gz_statep state;
    -{
    +local int gz_init(gz_statep state) {
         int ret;
         z_streamp strm = &(state.state->strm);
     
    @@ -75,10 +73,7 @@ local int gz_init(state)
        deflate() flush value.  If flush is Z_FINISH, then the deflate() state is
        reset to start a new gzip stream.  If gz->direct is true, then simply write
        to the output file without compressing, and ignore flush. */
    -local int gz_comp(state, flush)
    -    gz_statep state;
    -    int flush;
    -{
    +local int gz_comp(gz_statep state, int flush) {
         int ret, writ;
         unsigned have, put, max = ((unsigned)-1 >> 2) + 1;
         z_streamp strm = &(state.state->strm);
    @@ -147,10 +142,7 @@ local int gz_comp(state, flush)
     
     /* Compress len zeros to output.  Return -1 on a write error or memory
        allocation failure by gz_comp(), or 0 on success. */
    -local int gz_zero(state, len)
    -    gz_statep state;
    -    z_off64_t len;
    -{
    +local int gz_zero(gz_statep state, z_off64_t len) {
         int first;
         unsigned n;
         z_streamp strm = &(state.state->strm);
    @@ -180,11 +172,7 @@ local int gz_zero(state, len)
     
     /* Write len bytes from buf to file.  Return the number of bytes written.  If
        the returned value is less than len, then there was an error. */
    -local z_size_t gz_write(state, buf, len)
    -    gz_statep state;
    -    voidpc buf;
    -    z_size_t len;
    -{
    +local z_size_t gz_write(gz_statep state, voidpc buf, z_size_t len) {
         z_size_t put = len;
     
         /* if len is zero, avoid unnecessary operations */
    @@ -248,11 +236,7 @@ local z_size_t gz_write(state, buf, len)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzwrite(file, buf, len)
    -    gzFile file;
    -    voidpc buf;
    -    unsigned len;
    -{
    +int ZEXPORT gzwrite(gzFile file, voidpc buf, unsigned len) {
         gz_statep state;
     
         /* get internal structure */
    @@ -276,12 +260,8 @@ int ZEXPORT gzwrite(file, buf, len)
     }
     
     /* -- see zlib.h -- */
    -z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
    -    voidpc buf;
    -    z_size_t size;
    -    z_size_t nitems;
    -    gzFile file;
    -{
    +z_size_t ZEXPORT gzfwrite(voidpc buf, z_size_t size, z_size_t nitems,
    +                          gzFile file) {
         z_size_t len;
         gz_statep state;
     
    @@ -307,10 +287,7 @@ z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzputc(file, c)
    -    gzFile file;
    -    int c;
    -{
    +int ZEXPORT gzputc(gzFile file, int c) {
         unsigned have;
         unsigned char buf[1];
         gz_statep state;
    @@ -355,10 +332,7 @@ int ZEXPORT gzputc(file, c)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzputs(file, str)
    -    gzFile file;
    -    const char *str;
    -{
    +int ZEXPORT gzputs(gzFile file, const char *str) {
         int ret;
         z_size_t len;
         gz_statep state;
    @@ -382,8 +356,7 @@ int ZEXPORT gzputs(file, str)
     #include 
     
     /* -- see zlib.h -- */
    -int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va)
    -{
    +int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va) {
         int len;
         unsigned left;
         char *next;
    @@ -454,8 +427,7 @@ int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va)
         return len;
     }
     
    -int ZEXPORTVA gzprintf(gzFile file, const char *format, ...)
    -{
    +int ZEXPORTVA gzprintf(gzFile file, const char *format, ...) {
         va_list va;
         int ret;
     
    @@ -468,13 +440,10 @@ int ZEXPORTVA gzprintf(gzFile file, const char *format, ...)
     #else /* !STDC && !Z_HAVE_STDARG_H */
     
     /* -- see zlib.h -- */
    -int ZEXPORTVA gzprintf (file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
    -                       a11, a12, a13, a14, a15, a16, a17, a18, a19, a20)
    -    gzFile file;
    -    const char *format;
    -    int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
    -        a11, a12, a13, a14, a15, a16, a17, a18, a19, a20;
    -{
    +int ZEXPORTVA gzprintf(gzFile file, const char *format, int a1, int a2, int a3,
    +                       int a4, int a5, int a6, int a7, int a8, int a9, int a10,
    +                       int a11, int a12, int a13, int a14, int a15, int a16,
    +                       int a17, int a18, int a19, int a20) {
         unsigned len, left;
         char *next;
         gz_statep state;
    @@ -556,10 +525,7 @@ int ZEXPORTVA gzprintf (file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
     #endif
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzflush(file, flush)
    -    gzFile file;
    -    int flush;
    -{
    +int ZEXPORT gzflush(gzFile file, int flush) {
         gz_statep state;
     
         /* get internal structure */
    @@ -588,11 +554,7 @@ int ZEXPORT gzflush(file, flush)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzsetparams(file, level, strategy)
    -    gzFile file;
    -    int level;
    -    int strategy;
    -{
    +int ZEXPORT gzsetparams(gzFile file, int level, int strategy) {
         gz_statep state;
         z_streamp strm;
     
    @@ -630,9 +592,7 @@ int ZEXPORT gzsetparams(file, level, strategy)
     }
     
     /* -- see zlib.h -- */
    -int ZEXPORT gzclose_w(file)
    -    gzFile file;
    -{
    +int ZEXPORT gzclose_w(gzFile file) {
         int ret = Z_OK;
         gz_statep state;
     
    
    From b845be78b55364cc6f18c8e1bec5013a057a380b Mon Sep 17 00:00:00 2001
    From: Sylvestre Ledru 
    Date: Wed, 25 Sep 2024 12:29:03 +0200
    Subject: [PATCH 2/4] New upstream release
    
    ---
     debian/changelog                             |  8 +-
     debian/patches/encoded-package-metadata.diff | 79 --------------------
     debian/patches/env-package-metadata.diff     | 17 -----
     debian/patches/fix-armhf-build.diff          | 24 ------
     debian/patches/series                        |  4 +-
     5 files changed, 4 insertions(+), 128 deletions(-)
     delete mode 100644 debian/patches/encoded-package-metadata.diff
     delete mode 100644 debian/patches/env-package-metadata.diff
     delete mode 100644 debian/patches/fix-armhf-build.diff
    
    diff --git a/debian/changelog b/debian/changelog
    index cedfce47..c4a49a19 100644
    --- a/debian/changelog
    +++ b/debian/changelog
    @@ -1,10 +1,8 @@
    -mold (2.32.1+dfsg-3) UNRELEASED; urgency=medium
    +mold (2.34.0+dfsg-1) unstable; urgency=medium
     
    -  * Add --encoded-package-metadata option (pulled from upstream, #1308).
    -  * When no package-metadata option is given, fall-back to the
    -    envvar ELF_PACKAGE_METADATA.
    +  * New upstream release
     
    - -- Matthias Klose   Tue, 06 Aug 2024 13:29:29 +0200
    + -- Sylvestre Ledru   Wed, 25 Sep 2024 12:30:51 +0200
     
     mold (2.32.1+dfsg-2) unstable; urgency=medium
     
    diff --git a/debian/patches/encoded-package-metadata.diff b/debian/patches/encoded-package-metadata.diff
    deleted file mode 100644
    index 7a7a8ed6..00000000
    --- a/debian/patches/encoded-package-metadata.diff
    +++ /dev/null
    @@ -1,79 +0,0 @@
    ---- a/elf/cmdline.cc
    -+++ b/elf/cmdline.cc
    -@@ -119,6 +119,8 @@ Options:
    -   --oformat=binary            Omit ELF, section, and program headers
    -   --pack-dyn-relocs=[relr,none]
    -                               Pack dynamic relocations
    -+  --encoded-package-metadata=PERCENT_ENCODED_STRING
    -+                              Set a given string to .note.package
    -   --package-metadata=STRING   Set a given string to .note.package
    -   --perf                      Print performance statistics
    -   --pie, --pic-executable     Create a position-independent executable
    -@@ -406,6 +408,49 @@ split_by_comma_or_colon(std::string_view
    -   return vec;
    - }
    - 
    -+/* Decode a hexadecimal character. Return -1 on error. */
    -+static int hexdecode(char c) {
    -+  if ('0' <= c && c <= '9')
    -+    return c - '0';
    -+  if ('A' <= c && c <= 'F')
    -+    return c - 'A' + 10;
    -+  if ('a' <= c && c <= 'f')
    -+    return c - 'a' + 10;
    -+  return -1;
    -+}
    -+
    -+template 
    -+static std::string parse_percent_encoded_string(Context &ctx, std::string opt, std::string_view arg) {
    -+  std::string decoded;
    -+  int step = 1;
    -+  for (i64 i = 0; i < arg.size(); i += step) {
    -+    step = 1;
    -+    if (arg[i] != '%') {
    -+      decoded += arg[i];
    -+      continue;
    -+    }
    -+    if (i + 1 > arg.size()) {
    -+      Fatal(ctx) << "option --" << opt << ": invalid percent-encoded string: " << arg;
    -+    }
    -+    step++;
    -+    if (arg[i+1] == '%') {
    -+      decoded += '%';
    -+      continue;
    -+    }
    -+    if (i + 2 > arg.size()) {
    -+      Fatal(ctx) << "option --" << opt << ": invalid percent-encoded string: " << arg;
    -+    }
    -+    step++;
    -+    int hex1 = hexdecode(arg[i+1]);
    -+    int hex2 = hexdecode(arg[i+2]);
    -+    if (hex1 == -1 || hex2 == -1) {
    -+      Fatal(ctx) << "option --" << opt << ": invalid percent-encoded string: " << arg;
    -+    }
    -+    decoded += (char) ((hex1 << 4) + hex2);
    -+  }
    -+  return decoded;
    -+}
    -+
    - template 
    - static void read_retain_symbols_file(Context &ctx, std::string_view path) {
    -   MappedFile *mf = must_open_file(ctx, std::string(path));
    -@@ -863,6 +908,8 @@ std::vector parse_nonpositi
    -     } else if (read_flag("pack-dyn-relocs=none") ||
    -                read_z_flag("nopack-relative-relocs")) {
    -       ctx.arg.pack_dyn_relocs_relr = false;
    -+    } else if (read_arg("encoded-package-metadata")) {
    -+      ctx.arg.package_metadata = parse_percent_encoded_string(ctx, "encoded-package-metadata", arg);
    -     } else if (read_arg("package-metadata")) {
    -       ctx.arg.package_metadata = arg;
    -     } else if (read_flag("stats")) {
    ---- a/test/elf/package-metadata.sh
    -+++ b/test/elf/package-metadata.sh
    -@@ -10,3 +10,6 @@ EOF
    - 
    - $CC -B. -o $t/exe $t/a.o -Wl,-package-metadata='{"foo":"bar"}'
    - readelf -x .note.package $t/exe | grep -Fq '{"foo":"bar"}'
    -+
    -+$CC -B. -o $t/exe2 $t/a.o -Wl,--encoded-package-metadata=%7B%22foo%22%3A%22bar%22%7D
    -+readelf -x .note.package $t/exe2 | grep -Fq '{"foo":"bar"}'
    diff --git a/debian/patches/env-package-metadata.diff b/debian/patches/env-package-metadata.diff
    deleted file mode 100644
    index d88f614a..00000000
    --- a/debian/patches/env-package-metadata.diff
    +++ /dev/null
    @@ -1,17 +0,0 @@
    ---- a/elf/cmdline.cc
    -+++ b/elf/cmdline.cc
    -@@ -1480,6 +1480,14 @@ std::vector parse_nonpositi
    -       ctx.arg.dependency_file = ctx.arg.chroot + "/" + ctx.arg.dependency_file;
    -   }
    - 
    -+  // No package-metadata option, fall-back to env ELF_PACKAGE_METADATA
    -+  if (ctx.arg.package_metadata.empty()) {
    -+    const char* env_package_metadata = getenv("ELF_PACKAGE_METADATA");
    -+    if (env_package_metadata && strcmp(env_package_metadata, "") != 0) {
    -+      ctx.arg.package_metadata = std::string(env_package_metadata);
    -+    }
    -+  }
    -+
    -   // Mark GC root symbols
    -   for (Symbol *sym : ctx.arg.undefined)
    -     sym->gc_root = true;
    diff --git a/debian/patches/fix-armhf-build.diff b/debian/patches/fix-armhf-build.diff
    deleted file mode 100644
    index 5462028d..00000000
    --- a/debian/patches/fix-armhf-build.diff
    +++ /dev/null
    @@ -1,24 +0,0 @@
    -From baf9ae9038dba56324e08e5df0023225a6067154 Mon Sep 17 00:00:00 2001
    -From: Rui Ueyama 
    -Date: Tue, 16 Jul 2024 11:59:22 +0900
    -Subject: [PATCH] Fix a test on Debian
    -
    -If the default linker doesn't complain, just skip the test.
    -
    -Fixes https://github.com/rui314/mold/issues/1301
    ----
    - test/elf/arm_abs-error.sh | 2 ++
    - 1 file changed, 2 insertions(+)
    -
    -Index: mold/test/elf/arm_abs-error.sh
    -===================================================================
    ---- mold.orig/test/elf/arm_abs-error.sh
    -+++ mold/test/elf/arm_abs-error.sh
    -@@ -12,5 +12,7 @@ extern char foo;
    - int main() { printf("foo=%p\n", &foo); }
    - EOF
    - 
    -+$CC -o $t/exe -pie $t/a.o $t/b.o >& /dev/null && skip
    -+
    - ! $CC -B. -o $t/exe -pie $t/a.o $t/b.o >& $t/log
    - grep -q 'recompile with -fPIC' $t/log
    diff --git a/debian/patches/series b/debian/patches/series
    index c5049b32..8b137891 100644
    --- a/debian/patches/series
    +++ b/debian/patches/series
    @@ -1,3 +1 @@
    -fix-armhf-build.diff
    -encoded-package-metadata.diff
    -env-package-metadata.diff
    +
    
    From 9905ccd36ae7a5affd35bd7386349c6142ddb213 Mon Sep 17 00:00:00 2001
    From: Sylvestre Ledru 
    Date: Wed, 25 Sep 2024 13:11:17 +0200
Subject: [PATCH 3/4] When no package-metadata option is given, fall back to
 the envvar ELF_PACKAGE_METADATA.
    
    ---
     debian/changelog                         |  4 ++++
     debian/patches/env-package-metadata.diff | 19 +++++++++++++++++++
     debian/patches/series                    |  2 +-
     3 files changed, 24 insertions(+), 1 deletion(-)
     create mode 100644 debian/patches/env-package-metadata.diff
    
    diff --git a/debian/changelog b/debian/changelog
    index c4a49a19..0432c364 100644
    --- a/debian/changelog
    +++ b/debian/changelog
    @@ -2,6 +2,10 @@ mold (2.34.0+dfsg-1) unstable; urgency=medium
     
       * New upstream release
     
    +  [ Matthias Klose  ]
    +  * When no package-metadata option is given, fall-back to the
    +    envvar ELF_PACKAGE_METADATA.
    +
      -- Sylvestre Ledru   Wed, 25 Sep 2024 12:30:51 +0200
     
     mold (2.32.1+dfsg-2) unstable; urgency=medium
    diff --git a/debian/patches/env-package-metadata.diff b/debian/patches/env-package-metadata.diff
    new file mode 100644
    index 00000000..d1b1b590
    --- /dev/null
    +++ b/debian/patches/env-package-metadata.diff
    @@ -0,0 +1,19 @@
    +Index: mold/src/cmdline.cc
    +===================================================================
    +--- mold.orig/src/cmdline.cc
    ++++ mold/src/cmdline.cc
    +@@ -1506,6 +1506,14 @@ std::vector parse_nonpositi
    +       ctx.arg.dependency_file = ctx.arg.chroot + "/" + ctx.arg.dependency_file;
    +   }
    + 
    ++  // No package-metadata option, fall-back to env ELF_PACKAGE_METADATA
    ++  if (ctx.arg.package_metadata.empty()) {
    ++    const char* env_package_metadata = getenv("ELF_PACKAGE_METADATA");
    ++    if (env_package_metadata && strcmp(env_package_metadata, "") != 0) {
    ++      ctx.arg.package_metadata = std::string(env_package_metadata);
    ++    }
    ++  }
    ++
    +   // Mark GC root symbols
    +   for (Symbol *sym : ctx.arg.undefined)
    +     sym->gc_root = true;
    diff --git a/debian/patches/series b/debian/patches/series
    index 8b137891..3b68d71e 100644
    --- a/debian/patches/series
    +++ b/debian/patches/series
    @@ -1 +1 @@
    -
    +env-package-metadata.diff
    
    From f20a2230b39a0654ee5921e32c71575bf10bc339 Mon Sep 17 00:00:00 2001
    From: Sylvestre Ledru 
    Date: Wed, 25 Sep 2024 13:17:56 +0200
    Subject: [PATCH 4/4] Fix two missing-license-paragraph-in-dep5-copyright
     warnings
    
    ---
     debian/changelog |  1 +
     debian/copyright | 32 +++++++++++++++++++++++++++++++-
     2 files changed, 32 insertions(+), 1 deletion(-)
    
    diff --git a/debian/changelog b/debian/changelog
    index 0432c364..7e7ae693 100644
    --- a/debian/changelog
    +++ b/debian/changelog
    @@ -1,6 +1,7 @@
     mold (2.34.0+dfsg-1) unstable; urgency=medium
     
       * New upstream release
    +  * Fix two missing-license-paragraph-in-dep5-copyright warnings
     
       [ Matthias Klose  ]
       * When no package-metadata option is given, fall-back to the
    diff --git a/debian/copyright b/debian/copyright
    index 54ec87d2..3c0b7c1e 100644
    --- a/debian/copyright
    +++ b/debian/copyright
    @@ -9,7 +9,7 @@ Files-Excluded: third-party/mimalloc/bin/mimalloc-redirect.dll
                     third-party/zlib/contrib/dotzlib
     
     Files: *
    -Copyright: 2020-2021 Rui Ueyama 
    +Copyright: 2020-2024 Rui Ueyama 
     License: MIT
     
     Files: third-party/tbb/*
    @@ -111,3 +111,33 @@ License: MIT
      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
      SOFTWARE.
    +
    +License: BSD-3-Clause
    + Redistribution and use in source and binary forms, with or without
    + modification, are permitted provided that the following conditions are met:
    + .
    + 1. Redistributions of source code must retain the above copyright notice, this
    + list of conditions and the following disclaimer.
    + .
    + 2. Redistributions in binary form must reproduce the above copyright notice,
    + this list of conditions and the following disclaimer in the documentation
    + and/or other materials provided with the distribution.
    + .
    + 3. Neither the name of the copyright holder nor the names of its contributors
    + may be used to endorse or promote products derived from this software without
    + specific prior written permission.
    + .
    + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
    + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
    + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
    + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    +
    +License: GPL-2+
    + On Debian systems, the full text of the GNU General Public License
    + version 2 can be found in the file '/usr/share/common-licenses/GPL-2'.