very bad codegen for thread_local! on OSX #60141

Closed
mtak- opened this issue Apr 20, 2019 · 5 comments · Fixed by #60341
Labels
A-thread-locals - Area: Thread local storage (TLS)
I-slow - Issue: Problems and improvements with respect to performance of generated code
O-macos - Operating system: macOS

Comments

@mtak-
Contributor

mtak- commented Apr 20, 2019

This sample code demonstrates the problem.
godbolt link (macOS output on the left/top).

type T = String;

thread_local! {
    static X: T = unsafe { init() };
}

// The allow is needed because String is not an FFI-safe type.
#[allow(improper_ctypes)]
extern "C" {
    #[inline(never)]
    #[cold]
    fn init() -> T;
}

pub fn get() -> T {
    X.with(|x| x.clone())
}
asm
example::get:
  pushq %rbp
  movq %rsp, %rbp
  pushq %r14
  pushq %rbx
  subq $64, %rsp
  movq %rdi, %r14
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  cmpb $0, 25(%rax)
  jne LBB3_9
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  cmpb $0, 24(%rax)
  jne LBB3_3
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  movq %rax, %rbx
  leaq std::thread::local::fast::destroy_value(%rip), %rsi
  movq %rax, %rdi
  callq std::sys::unix::fast_thread_local::register_dtor
  movb $1, 24(%rbx)
LBB3_3:
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  cmpq $0, (%rax)
  je LBB3_4
LBB3_7:
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  leaq -40(%rbp), %rdi
  movq %rax, %rsi
  callq <alloc::string::String as core::clone::Clone>::clone
  movq -40(%rbp), %rax
  vmovups -32(%rbp), %xmm0
  vmovaps %xmm0, -80(%rbp)
  testq %rax, %rax
  je LBB3_9
  movq %rax, (%r14)
  vmovaps -80(%rbp), %xmm0
  vmovups %xmm0, 8(%r14)
  movq %r14, %rax
  addq $64, %rsp
  popq %rbx
  popq %r14
  popq %rbp
  retq
LBB3_4:
  leaq -40(%rbp), %rdi
  callq _init
  vmovups -40(%rbp), %xmm0
  vmovaps %xmm0, -64(%rbp)
  movq -24(%rbp), %rcx
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  movq (%rax), %rdi
  movq 8(%rax), %rsi
  vmovaps -64(%rbp), %xmm0
  vmovups %xmm0, (%rax)
  movq %rcx, 16(%rax)
  testq %rdi, %rdi
  je LBB3_7
  testq %rsi, %rsi
  je LBB3_7
  movl $1, %edx
  callq ___rust_dealloc
  jmp LBB3_7
LBB3_9:
  callq core::result::unwrap_failed

The asm demonstrates that even when the value has already been initialized and the destructor registered (but not yet run), the thread-local pointer is looked up (callq *(%rdi)) four times! On Linux that lookup occurs only once.

One potential fix is to insert a read_volatile inside of __getit():

    let key = &__KEY;
    // TODO: make a platform-specific version of this
    let key = unsafe { $crate::ptr::read_volatile(&key) };
    key.get()

This improves the asm to the following:

asm
__ZN17thread_local_test17thread_local_test3get17h69984f7e963f5a1bE:
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r14
	pushq	%rbx
	subq	$48, %rsp
	movq	%rdi, %r14
	movq	__ZN17thread_local_test17thread_local_test3get1X7__getit5__KEY17h1886c2e600469f01E@TLVP(%rip), %rdi
	callq	*(%rdi)
	movq	%rax, -48(%rbp)
	movq	-48(%rbp), %rbx
	cmpb	$0, 25(%rbx)
	jne	LBB10_9
	cmpb	$0, 24(%rbx)
	jne	LBB10_3
	leaq	__ZN3std6thread5local4fast13destroy_value17hbc43def25f86e32eE(%rip), %rsi
	movq	%rbx, %rdi
	callq	__ZN3std3sys4unix17fast_thread_local13register_dtor17ha35ff2a0753ab802E
	movb	$1, 24(%rbx)
LBB10_3:
	cmpq	$0, (%rbx)
	je	LBB10_4
LBB10_7:
	leaq	-48(%rbp), %rdi
	movq	%rbx, %rsi
	callq	__ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h9234dcb674122143E
	movq	-48(%rbp), %rax
	vmovups	-40(%rbp), %xmm0
	vmovaps	%xmm0, -64(%rbp)
	testq	%rax, %rax
	je	LBB10_9
	movq	%rax, (%r14)
	vmovaps	-64(%rbp), %xmm0
	vmovups	%xmm0, 8(%r14)
	addq	$48, %rsp
	popq	%rbx
	popq	%r14
	popq	%rbp
	retq
LBB10_4:
	leaq	-48(%rbp), %rdi
	callq	__ZN17thread_local_test17thread_local_test4init17h2e4a5dfd2802b210E
	vmovaps	-48(%rbp), %xmm0
	movq	-32(%rbp), %rax
	movq	(%rbx), %rdi
	movq	8(%rbx), %rsi
	vmovups	%xmm0, (%rbx)
	movq	%rax, 16(%rbx)
	testq	%rdi, %rdi
	je	LBB10_7
	testq	%rsi, %rsi
	je	LBB10_7
	movl	$1, %edx
	callq	___rust_dealloc
	jmp	LBB10_7
LBB10_9:
	callq	__ZN4core6result13unwrap_failed17h6ad7be40c736aa06E

Benchmark

    // Requires nightly: #![feature(test)] and `extern crate test;`
    use test::Bencher;

    #[inline(never)]
    #[cold]
    fn init() -> String {
        String::from("hello world")
    }

    #[bench]
    fn thread_local(b: &mut Bencher) {
        const ITER_COUNT: usize = 1_000_000;
        thread_local! {
            static X: String = init();
        }
        X.with(|_| {});
        b.iter(|| {
            for _ in 0..ITER_COUNT {
                X.with(|x| {
                    test::black_box(x);
                })
            }
        })
    }

Results

current       ... bench:   7,128,388 ns/iter
read_volatile ... bench:   2,085,139 ns/iter (+/- 204,564)

With ITER_COUNT = 1_000_000, that is roughly 7.1 ns per X.with call before the change and roughly 2.1 ns after.
@jonas-schievink added the A-thread-locals, I-slow, and O-macos labels on Apr 20, 2019
@LifeIsStrange

It may be interesting to see how much the codegen is altered by inlining __getit. See #59720.

@mtak-
Contributor Author

mtak- commented Apr 24, 2019

@LifeIsStrange it has been inlined here.

I would like to fix this, but I'm not sure if the bug is on the rustc side or in libstd. Can anyone give some guidance on where to look? If not, I'll just submit a platform-specific version of the read_volatile trick, which feels more like a workaround than a proper fix.

bors added a commit that referenced this issue May 16, 2019
macos tlv workaround

fixes: #60141

Includes:
* Remove dead code: `requires_move_before_drop`. This hasn't been needed for a while now (oops, I should have removed it in #57655).
* Redox had a copy of `fast::Key` (not sure why?). That has been removed.
* Perform a `read_volatile` on OSX to reduce `tlv_get_addr` calls per `__getit` from 2-4 (depending on context) to 1.

`tlv_get_addr` is relatively expensive (~1.5 ns on my machine).

Previously, in contexts where `__getit` was inlined, 4 calls to `tlv_get_addr` were performed per lookup. For some reason, when `__getit` is not inlined this is reduced to 2, and performance improves to match.

After this PR, I have only ever seen 1 call to `tlv_get_addr` per `__getit`, and macOS now benefits from situations where `__getit` is inlined.

I'm not sure if the `read_volatile(&&__KEY)` trick is working around an LLVM bug, or a rustc bug, or neither.

r? @alexcrichton
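
For reference, a rough sketch of the shape of such a platform-gated workaround (illustrative only: `launder` is a hypothetical helper name, and the actual change is whatever landed in #60341):

    // Hypothetical sketch of the platform-gated trick, not the actual
    // libstd code from #60341.
    #[cfg(target_os = "macos")]
    #[inline]
    unsafe fn launder<T>(key: &T) -> &T {
        // A volatile load must be performed exactly once, so LLVM keeps
        // the address computed by _tlv_get_addr in a register instead of
        // recomputing it for every later use.
        std::ptr::read_volatile(&key)
    }

    #[cfg(not(target_os = "macos"))]
    #[inline]
    unsafe fn launder<T>(key: &T) -> &T {
        // Other platforms already memoize the address, so the volatile
        // read (which inhibits other optimizations) is skipped.
        key
    }

Inside __getit, the lookup would then become something like launder(&__KEY).get().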
bors added a commit that referenced this issue Jun 20, 2019 (same message as above)
@RalfJung
Member

RalfJung commented Jun 20, 2019

This is so odd. Why does using a volatile read improve performance?

@mtak-
Contributor Author

mtak- commented Jun 20, 2019

Some background you probably already know: LLVM and gcc/clang/etc. treat volatile loads/stores as operations that must be performed exactly once; RCU and other techniques depend on this. Therefore the loaded value has to be memoized (probably in a register) if it is used multiple times after the read_volatile. The load in this case is of the address of a thread-local variable. Getting the address of a thread local amounts to a function call, _tlv_get_addr, which is slow on macOS because it goes through a dylib.

Before the read_volatile trick, the address is not memoized because the optimizer mistakenly thinks it is cheaper to just compute it again, as it probably would be for normal non-thread-local variables. This results in a minimum of 2-4 _tlv_get_addr calls per use of a thread local on macOS. For whatever reason, LLVM optimizes this correctly on Linux.
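
As a concrete illustration of the memoization effect, here is a minimal standalone sketch (hypothetical code; it uses the nightly-only #[thread_local] attribute rather than the thread_local! macro, purely for illustration):

    #![feature(thread_local)] // nightly-only, for illustration

    use std::cell::Cell;
    use std::ptr;

    #[thread_local]
    static COUNTER: Cell<u32> = Cell::new(0);

    pub fn bump_twice() -> u32 {
        // Launder the TLS reference through a volatile read. The volatile
        // load must happen exactly once, so the address returned by
        // _tlv_get_addr is kept in a register and reused for both accesses
        // below, rather than being recomputed before each one.
        let c: &Cell<u32> = &COUNTER;
        let c: &Cell<u32> = unsafe { ptr::read_volatile(&c) };
        c.set(c.get() + 1);
        c.get()
    }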

@RalfJung
Member

Before the read_volatile trick, the address is not memoized because the optimizer mistakenly thinks it is cheaper to just compute it again, as it probably would be for normal non-thread-local variables.

Ah, that makes a lot of sense. Thanks for the explanation!
