very bad codegen for thread_local! on OSX #60141

Closed
mtak- opened this issue Apr 20, 2019 · 5 comments · Fixed by #60341
Labels
A-thread-locals - Area: Thread local storage (TLS)
I-slow - Issue: Problems and improvements with respect to performance of generated code
O-macos - Operating system: macOS

Comments

@mtak-
Contributor

mtak- commented Apr 20, 2019

This sample code demonstrates the problem.
godbolt link (macOS output on the left/top).

type T = String;

thread_local! {
    static X: T = unsafe { init() };
}

// The allow is needed because String is not an FFI-safe type.
#[allow(improper_ctypes)]
extern "C" {
    #[inline(never)]
    #[cold]
    fn init() -> T;
}

pub fn get() -> T {
    X.with(|x| x.clone())
}
asm
example::get:
  pushq %rbp
  movq %rsp, %rbp
  pushq %r14
  pushq %rbx
  subq $64, %rsp
  movq %rdi, %r14
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  cmpb $0, 25(%rax)
  jne LBB3_9
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  cmpb $0, 24(%rax)
  jne LBB3_3
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  movq %rax, %rbx
  leaq std::thread::local::fast::destroy_value(%rip), %rsi
  movq %rax, %rdi
  callq std::sys::unix::fast_thread_local::register_dtor
  movb $1, 24(%rbx)
LBB3_3:
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  cmpq $0, (%rax)
  je LBB3_4
LBB3_7:
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  leaq -40(%rbp), %rdi
  movq %rax, %rsi
  callq <alloc::string::String as core::clone::Clone>::clone
  movq -40(%rbp), %rax
  vmovups -32(%rbp), %xmm0
  vmovaps %xmm0, -80(%rbp)
  testq %rax, %rax
  je LBB3_9
  movq %rax, (%r14)
  vmovaps -80(%rbp), %xmm0
  vmovups %xmm0, 8(%r14)
  movq %r14, %rax
  addq $64, %rsp
  popq %rbx
  popq %r14
  popq %rbp
  retq
LBB3_4:
  leaq -40(%rbp), %rdi
  callq _init
  vmovups -40(%rbp), %xmm0
  vmovaps %xmm0, -64(%rbp)
  movq -24(%rbp), %rcx
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  movq (%rax), %rdi
  movq 8(%rax), %rsi
  vmovaps -64(%rbp), %xmm0
  vmovups %xmm0, (%rax)
  movq %rcx, 16(%rax)
  testq %rdi, %rdi
  je LBB3_7
  testq %rsi, %rsi
  je LBB3_7
  movl $1, %edx
  callq ___rust_dealloc
  jmp LBB3_7
LBB3_9:
  callq core::result::unwrap_failed

The asm demonstrates that even when the value has already been initialized and the destructor registered (but not yet run), the thread-local pointer is looked up (callq *(%rdi)) four times! On Linux that lookup occurs only once.

One potential fix is to insert a read_volatile inside of __getit():

    let key = &__KEY;
    // TODO: make a platform-specific version of this
    let key = unsafe { $crate::ptr::read_volatile(&key) };
    key.get()

This improves the asm to the following:

asm
__ZN17thread_local_test17thread_local_test3get17h69984f7e963f5a1bE:
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r14
	pushq	%rbx
	subq	$48, %rsp
	movq	%rdi, %r14
	movq	__ZN17thread_local_test17thread_local_test3get1X7__getit5__KEY17h1886c2e600469f01E@TLVP(%rip), %rdi
	callq	*(%rdi)
	movq	%rax, -48(%rbp)
	movq	-48(%rbp), %rbx
	cmpb	$0, 25(%rbx)
	jne	LBB10_9
	cmpb	$0, 24(%rbx)
	jne	LBB10_3
	leaq	__ZN3std6thread5local4fast13destroy_value17hbc43def25f86e32eE(%rip), %rsi
	movq	%rbx, %rdi
	callq	__ZN3std3sys4unix17fast_thread_local13register_dtor17ha35ff2a0753ab802E
	movb	$1, 24(%rbx)
LBB10_3:
	cmpq	$0, (%rbx)
	je	LBB10_4
LBB10_7:
	leaq	-48(%rbp), %rdi
	movq	%rbx, %rsi
	callq	__ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h9234dcb674122143E
	movq	-48(%rbp), %rax
	vmovups	-40(%rbp), %xmm0
	vmovaps	%xmm0, -64(%rbp)
	testq	%rax, %rax
	je	LBB10_9
	movq	%rax, (%r14)
	vmovaps	-64(%rbp), %xmm0
	vmovups	%xmm0, 8(%r14)
	addq	$48, %rsp
	popq	%rbx
	popq	%r14
	popq	%rbp
	retq
LBB10_4:
	leaq	-48(%rbp), %rdi
	callq	__ZN17thread_local_test17thread_local_test4init17h2e4a5dfd2802b210E
	vmovaps	-48(%rbp), %xmm0
	movq	-32(%rbp), %rax
	movq	(%rbx), %rdi
	movq	8(%rbx), %rsi
	vmovups	%xmm0, (%rbx)
	movq	%rax, 16(%rbx)
	testq	%rdi, %rdi
	je	LBB10_7
	testq	%rsi, %rsi
	je	LBB10_7
	movl	$1, %edx
	callq	___rust_dealloc
	jmp	LBB10_7
LBB10_9:
	callq	__ZN4core6result13unwrap_failed17h6ad7be40c736aa06E

Benchmark

    // Requires nightly: #![feature(test)] and `extern crate test;`
    use test::Bencher;

    #[inline(never)]
    #[cold]
    fn init() -> String {
        String::from("hello world")
    }

    #[bench]
    fn thread_local(b: &mut Bencher) {
        const ITER_COUNT: usize = 1_000_000;
        thread_local! {
            static X: String = init();
        }
        X.with(|_| {});
        b.iter(|| {
            for _ in 0..ITER_COUNT {
                X.with(|x| {
                    test::black_box(x);
                })
            }
        })
    }

Results

current       ... bench:   7,128,388 ns/iter
read_volatile ... bench:   2,085,139 ns/iter (+/- 204,564)

With ITER_COUNT = 1_000_000, that is roughly 7.1 ns per X.with call before the change and roughly 2.1 ns after.
@jonas-schievink added the A-thread-locals, I-slow, and O-macos labels on Apr 20, 2019
@LifeIsStrange

It may be interesting to see how much the codegen is altered by inlining __getit. See #59720.

@mtak-
Contributor Author

mtak- commented Apr 24, 2019

@LifeIsStrange it has been inlined here.

I would like to fix this, but I'm not sure if the bug is on the rustc side or in libstd. Can anyone give some guidance on where to look? If not, I'll just submit a platform-specific version of the read_volatile trick, which feels more like a workaround than a proper fix.

bors added a commit that referenced this issue May 16, 2019
macos tlv workaround

fixes: #60141

Includes:
* Remove dead code: `requires_move_before_drop`. This hasn't been needed for a while now (oops, I should have removed it in #57655).
* Redox had a copy of `fast::Key` (not sure why?). That has been removed.
* Perform a `read_volatile` on OSX to reduce `tlv_get_addr` calls per `__getit` from 2-4 (depending on context) to 1.

`tlv_get_addr` is relatively expensive (~1.5 ns on my machine).

Previously, in contexts where `__getit` was inlined, 4 calls to `tlv_get_addr` were performed per lookup. For some reason, when `__getit` is not inlined this is reduced to 2, and performance improves to match.

After this PR, I have only ever seen 1 call to `tlv_get_addr` per `__getit`, and macOS now benefits from situations where `__getit` is inlined.

I'm not sure if the `read_volatile(&&__KEY)` trick is working around an LLVM bug, or a rustc bug, or neither.

r? @alexcrichton
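
For reference, a rough sketch of the shape of such a platform-gated workaround (illustrative only: `launder` is a hypothetical helper name, and the actual change is whatever landed in #60341):

    // Hypothetical sketch of the platform-gated trick, not the actual
    // libstd code from #60341.
    #[cfg(target_os = "macos")]
    #[inline]
    unsafe fn launder<T>(key: &T) -> &T {
        // A volatile load must be performed exactly once, so LLVM keeps
        // the address computed by _tlv_get_addr in a register instead of
        // recomputing it for every later use.
        std::ptr::read_volatile(&key)
    }

    #[cfg(not(target_os = "macos"))]
    #[inline]
    unsafe fn launder<T>(key: &T) -> &T {
        // Other platforms already memoize the address, so the volatile
        // read (which inhibits other optimizations) is skipped.
        key
    }

Inside __getit, the lookup would then become something like launder(&__KEY).get().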
bors added a commit that referenced this issue Jun 20, 2019 (same message as above)
@RalfJung
Member

RalfJung commented Jun 20, 2019

This is so odd. Why does using a volatile read improve performance?

@mtak-
Contributor Author

mtak- commented Jun 20, 2019

Some background you probably already know: LLVM and gcc/clang/etc. treat volatile loads/stores as operations that must be performed exactly once; RCU and other techniques depend on this. Therefore the loaded value has to be memoized (probably in a register) if it is used multiple times after the read_volatile. The load in this case is of the address of a thread-local variable. Getting the address of a thread local amounts to a function call, _tlv_get_addr, which is slow on macOS because it goes through a dylib.

Before the read_volatile trick, the address is not memoized because the optimizer mistakenly thinks it is cheaper to just compute it again, as it probably would be for normal non-thread-local variables. This results in a minimum of 2-4 _tlv_get_addr calls per use of a thread local on macOS. For whatever reason, LLVM optimizes this correctly on Linux.
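
As a concrete illustration of the memoization effect, here is a minimal standalone sketch (hypothetical code; it uses the nightly-only #[thread_local] attribute rather than the thread_local! macro, purely for illustration):

    #![feature(thread_local)] // nightly-only, for illustration

    use std::cell::Cell;
    use std::ptr;

    #[thread_local]
    static COUNTER: Cell<u32> = Cell::new(0);

    pub fn bump_twice() -> u32 {
        // Launder the TLS reference through a volatile read. The volatile
        // load must happen exactly once, so the address returned by
        // _tlv_get_addr is kept in a register and reused for both accesses
        // below, rather than being recomputed before each one.
        let c: &Cell<u32> = &COUNTER;
        let c: &Cell<u32> = unsafe { ptr::read_volatile(&c) };
        c.set(c.get() + 1);
        c.get()
    }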

@RalfJung
Member

Before the read_volatile trick, the address is not memoized because the optimizer mistakenly thinks it is cheaper to just compute it again, as it probably would be for normal non-thread-local variables.

Ah, that makes a lot of sense. Thanks for the explanation!
