diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs
index 6fb7b0ec..4cdf86e9 100644
--- a/src/arch/aarch64.rs
+++ b/src/arch/aarch64.rs
@@ -47,14 +47,12 @@
 // from the stack frame at x29 (in the parent stack), thus continuing
 // unwinding at the swap call site instead of falling off the end of context stack.
 use core::mem;
-use stack::Stack;
+use stack;
+use arch::StackPointer;
 
 pub const STACK_ALIGNMENT: usize = 16;
 
-#[derive(Debug, Clone, Copy)]
-pub struct StackPointer(*mut usize);
-
-pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer {
+pub unsafe fn init(stack: &stack::Stack, f: unsafe extern "C" fn(usize, StackPointer)) -> StackPointer {
   #[cfg(not(target_vendor = "apple"))]
   #[naked]
   unsafe extern "C" fn trampoline_1() {
@@ -129,15 +127,29 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
       # Call the provided function.
       ldr x2, [sp, #16]
       blr x2
+
+      # Clear the stack pointer. We can't call into this context any more once
+      # the function has returned.
+      mov x1, #0
+
+      # Restore the stack pointer of the parent context. No CFI adjustments
+      # are needed since we have the same stack frame as trampoline_1.
+      ldr x2, [sp]
+      mov sp, x2
+
+      # Load frame and instruction pointers of the parent context.
+      ldp x29, x30, [sp], #16
+      .cfi_adjust_cfa_offset -16
+      .cfi_restore x29
+      .cfi_restore x30
+
+      # Return into the parent context. Use `br` instead of a `ret` to avoid
+      # return address mispredictions.
+      br x30
     "#
     : : : : "volatile")
   }
 
-  unsafe fn push(sp: &mut StackPointer, val: usize) {
-    sp.0 = sp.0.offset(-1);
-    *sp.0 = val
-  }
-
   // We set up the stack in a somewhat special way so that to the unwinder it
   // looks like trampoline_1 has called trampoline_2, which has in turn called
   // swap::trampoline.
@@ -146,36 +158,30 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
   // followed by the x29 value for that frame. This setup supports unwinding
   // using DWARF CFI as well as the frame pointer-based unwinding used by tools
   // such as perf or dtrace.
-  let mut sp = StackPointer(stack.base() as *mut usize);
+  let mut sp = StackPointer::stack_base(stack);
 
-  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
-  push(&mut sp, f as usize); // Function that trampoline_2 should call
+  sp.push(0 as usize); // Padding to ensure the stack is properly aligned
+  sp.push(f as usize); // Function that trampoline_2 should call
 
   // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline
   // each time a context switch is performed.
-  push(&mut sp, trampoline_1 as usize + 4); // Return after the nop
-  push(&mut sp, 0xdeaddeaddead0cfa);        // CFA slot
+  sp.push(trampoline_1 as usize + 4); // Return after the nop
+  sp.push(0xdeaddeaddead0cfa);        // CFA slot
 
   // Call frame for swap::trampoline. We set up the x29 value to point to the
   // parent call frame.
-  let frame = sp;
-  push(&mut sp, trampoline_2 as usize + 4); // Entry point, skip initial nop
-  push(&mut sp, frame.0 as usize);          // Pointer to parent call frame
+  let frame = sp.offset(0);
+  sp.push(trampoline_2 as usize + 4); // Entry point, skip initial nop
+  sp.push(frame as usize);            // Pointer to parent call frame
 
   sp
 }
 
 #[inline(always)]
-pub unsafe fn swap(arg: usize, new_sp: StackPointer,
-                   new_stack: Option<&Stack>) -> (usize, StackPointer) {
+pub unsafe fn swap_link(arg: usize, new_sp: StackPointer,
+                        new_stack: &stack::Stack) -> (usize, Option<StackPointer>) {
   // Address of the topmost CFA stack slot.
-  let mut dummy: usize = mem::uninitialized();
-  let new_cfa = if let Some(new_stack) = new_stack {
-    (new_stack.base() as *mut usize).offset(-4)
-  } else {
-    // Just pass a dummy pointer if we aren't linking the stack
-    &mut dummy
-  };
+  let new_cfa = StackPointer::stack_base(new_stack).offset(-4);
 
   #[naked]
   unsafe extern "C" fn trampoline() {
@@ -213,7 +219,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
   }
 
   let ret: usize;
-  let ret_sp: *mut usize;
+  let ret_sp: usize;
   asm!(
     r#"
       # Call the trampoline to switch to the new context.
@@ -240,5 +246,67 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
     // the "alignstack" LLVM inline assembly option does exactly the same
     // thing on AArch64.
     : "volatile", "alignstack");
-  (ret, StackPointer(ret_sp))
+  (ret, mem::transmute(ret_sp))
+}
+
+#[inline(always)]
+pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) {
+  #[naked]
+  unsafe extern "C" fn trampoline() {
+    asm!(
+      r#"
+        # Save the frame pointer and link register; the unwinder uses them to find
+        # the CFA of the caller, and so they have to have the correct value immediately
+        # after the call instruction that invoked the trampoline.
+        stp x29, x30, [sp, #-16]!
+        .cfi_adjust_cfa_offset 16
+        .cfi_rel_offset x30, 8
+        .cfi_rel_offset x29, 0
+
+        # Pass the stack pointer of the old context to the new one.
+        mov x1, sp
+        # Load stack pointer of the new context.
+        mov sp, x2
+
+        # Load frame and instruction pointers of the new context.
+        ldp x29, x30, [sp], #16
+        .cfi_adjust_cfa_offset -16
+        .cfi_restore x29
+        .cfi_restore x30
+
+        # Return into the new context. Use `br` instead of a `ret` to avoid
+        # return address mispredictions.
+        br x30
+      "#
+      : : : : "volatile")
+  }
+
+  let ret: usize;
+  let ret_sp: usize;
+  asm!(
+    r#"
+      # Call the trampoline to switch to the new context.
+      bl ${2}
+    "#
+    : "={x0}" (ret)
+      "={x1}" (ret_sp)
+    : "s" (trampoline as usize)
+      "{x0}" (arg)
+      "{x2}" (new_sp.0)
+    : /*"x0", "x1",*/ "x2",  "x3",  "x4",  "x5",  "x6",  "x7",
+      "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
+      "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+      "x24", "x25", "x26", "x27", "x28", /*"fp",*/ "lr", /*"sp",*/
+      "v0",  "v1",  "v2",  "v3",  "v4",  "v5",  "v6",  "v7",
+      "v8",  "v9",  "v10", "v11", "v12", "v13", "v14", "v15",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+      "cc", "memory"
+    // Ideally, we would set the LLVM "noredzone" attribute on this function
+    // (and it would be propagated to the call site). Unfortunately, rustc
+    // provides no such functionality. Fortunately, by a lucky coincidence,
+    // the "alignstack" LLVM inline assembly option does exactly the same
+    // thing on AArch64.
+    : "volatile", "alignstack");
+  (ret, mem::transmute(ret_sp))
 }
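
The aarch64 backend above is representative of what every backend in this patch does: the old `swap(arg, new_sp, Option<&Stack>)` is split into `swap_link` (which links the new stack into the caller's for unwinding) and a plain `swap`, and a context's function may now return, which `swap_link` reports as `None`. A minimal sketch of the resulting call-site contract; `stack` and `entry` are illustrative names, not code from this patch:

    // Runs on the fresh stack prepared by init(); it may now return normally.
    unsafe extern "C" fn entry(_arg: usize, _parent_sp: StackPointer) { /* ... */ }

    unsafe {
      let sp = arch::init(&stack, entry);
      match arch::swap_link(0, sp, &stack) {
        (val, Some(sp)) => { /* context yielded val; resume via swap_link(.., sp, &stack) */ }
        (_, None)       => { /* entry returned; the context is dead, the stack reusable */ }
      }
    }
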
diff --git a/src/arch/mod.rs b/src/arch/mod.rs
index f37314dc..9c9c5bb1 100644
--- a/src/arch/mod.rs
+++ b/src/arch/mod.rs
@@ -7,6 +7,8 @@
 // copied, modified, or distributed except according to those terms.
 
 pub use self::imp::*;
+use core::nonzero::NonZero;
+use stack;
 
 #[allow(unused_attributes)] // rust-lang/rust#35584
 #[cfg_attr(target_arch = "x86", path = "x86.rs")]
@@ -15,6 +17,27 @@ pub use self::imp::*;
 #[cfg_attr(target_arch = "or1k", path = "or1k.rs")]
 mod imp;
 
+#[derive(Debug, Clone, Copy)]
+pub struct StackPointer(NonZero<*mut usize>);
+
+impl StackPointer {
+  #[inline(always)]
+  pub unsafe fn push(&mut self, val: usize) {
+    self.0 = NonZero::new(self.0.offset(-1));
+    **self.0 = val;
+  }
+
+  #[inline(always)]
+  pub unsafe fn stack_base(stack: &stack::Stack) -> StackPointer {
+    StackPointer(NonZero::new(stack.base() as *mut usize))
+  }
+
+  #[inline(always)]
+  pub unsafe fn offset(&self, count: isize) -> *mut usize {
+    self.0.offset(count)
+  }
+}
+
 #[cfg(test)]
 mod tests {
   extern crate test;
@@ -25,11 +48,11 @@ mod tests {
 
   #[test]
   fn context() {
-    unsafe extern "C" fn adder(arg: usize, stack_ptr: StackPointer) -> ! {
+    unsafe extern "C" fn adder(arg: usize, stack_ptr: StackPointer) {
       println!("it's alive! arg: {}", arg);
-      let (arg, stack_ptr) = arch::swap(arg + 1, stack_ptr, None);
+      let (arg, stack_ptr) = arch::swap(arg + 1, stack_ptr);
       println!("still alive! arg: {}", arg);
-      arch::swap(arg + 1, stack_ptr, None);
+      arch::swap(arg + 1, stack_ptr);
       panic!("i should be dead");
     }
@@ -37,26 +60,26 @@ mod tests {
       let stack = OsStack::new(4 << 20).unwrap();
       let stack_ptr = arch::init(&stack, adder);
 
-      let (ret, stack_ptr) = arch::swap(10, stack_ptr, Some(&stack));
+      let (ret, stack_ptr) = arch::swap_link(10, stack_ptr, &stack);
       assert_eq!(ret, 11);
-      let (ret, _) = arch::swap(50, stack_ptr, Some(&stack));
+      let (ret, _) = arch::swap_link(50, stack_ptr.unwrap(), &stack);
       assert_eq!(ret, 51);
     }
   }
 
   #[test]
   fn context_simd() {
-    unsafe extern "C" fn permuter(arg: usize, stack_ptr: StackPointer) -> ! {
+    unsafe extern "C" fn permuter(arg: usize, stack_ptr: StackPointer) {
       // This will crash if the stack is not aligned properly.
       let x = simd::i32x4::splat(arg as i32);
       let y = x * x;
       println!("simd result: {:?}", y);
-      let (_, stack_ptr) = arch::swap(0, stack_ptr, None);
+      let (_, stack_ptr) = arch::swap(0, stack_ptr);
       // And try again after a context switch.
       let x = simd::i32x4::splat(arg as i32);
       let y = x * x;
       println!("simd result: {:?}", y);
-      arch::swap(0, stack_ptr, None);
+      arch::swap(0, stack_ptr);
       panic!("i should be dead");
     }
@@ -64,16 +87,16 @@ mod tests {
       let stack = OsStack::new(4 << 20).unwrap();
       let stack_ptr = arch::init(&stack, permuter);
 
-      let (_, stack_ptr) = arch::swap(10, stack_ptr, Some(&stack));
-      arch::swap(20, stack_ptr, Some(&stack));
+      let (_, stack_ptr) = arch::swap_link(10, stack_ptr, &stack);
+      arch::swap_link(20, stack_ptr.unwrap(), &stack);
     }
   }
 
-  unsafe extern "C" fn do_panic(arg: usize, stack_ptr: StackPointer) -> ! {
+  unsafe extern "C" fn do_panic(arg: usize, stack_ptr: StackPointer) {
     match arg {
       0 => panic!("arg=0"),
       1 => {
-        arch::swap(0, stack_ptr, None);
+        arch::swap(0, stack_ptr);
         panic!("arg=1");
       }
       _ => unreachable!()
@@ -87,7 +110,7 @@ mod tests {
       let stack = OsStack::new(4 << 20).unwrap();
       let stack_ptr = arch::init(&stack, do_panic);
 
-      arch::swap(0, stack_ptr, Some(&stack));
+      arch::swap_link(0, stack_ptr, &stack);
     }
   }
 
@@ -98,18 +121,31 @@ mod tests {
       let stack = OsStack::new(4 << 20).unwrap();
       let stack_ptr = arch::init(&stack, do_panic);
 
-      let (_, stack_ptr) = arch::swap(1, stack_ptr, Some(&stack));
-      arch::swap(0, stack_ptr, Some(&stack));
+      let (_, stack_ptr) = arch::swap_link(1, stack_ptr, &stack);
+      arch::swap_link(0, stack_ptr.unwrap(), &stack);
+    }
+  }
+
+  #[test]
+  fn ret() {
+    unsafe extern "C" fn ret2(_: usize, _: StackPointer) {}
+
+    unsafe {
+      let stack = OsStack::new(4 << 20).unwrap();
+      let stack_ptr = arch::init(&stack, ret2);
+
+      let (_, stack_ptr) = arch::swap_link(0, stack_ptr, &stack);
+      assert!(stack_ptr.is_none());
     }
   }
 
   #[bench]
   fn swap(b: &mut test::Bencher) {
-    unsafe extern "C" fn loopback(mut arg: usize, mut stack_ptr: StackPointer) -> ! {
+    unsafe extern "C" fn loopback(mut arg: usize, mut stack_ptr: StackPointer) {
      // This deliberately does not ignore arg, to measure the time it takes
      // to move the return value between registers.
      loop {
-        let data = arch::swap(arg, stack_ptr, None);
+        let data = arch::swap(arg, stack_ptr);
        arg = data.0;
        stack_ptr = data.1;
      }
@@ -120,7 +156,7 @@ mod tests {
      let mut stack_ptr = arch::init(&stack, loopback);
 
      b.iter(|| for _ in 0..10 {
-      stack_ptr = arch::swap(0, stack_ptr, Some(&stack)).1;
+      stack_ptr = arch::swap_link(0, stack_ptr, &stack).1.unwrap();
      });
   }
 }
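
A note on why the shared `StackPointer` wraps `NonZero<*mut usize>` rather than a bare pointer: it gives `Option<StackPointer>` the null-pointer optimization, so `None` is represented by 0 and the whole `Option` fits in one machine word. That is the invariant the `mem::transmute(ret_sp)` calls in the arch backends rely on, since the trampolines report "context exited" by clearing the stack-pointer register. A sketch of the assumption, not part of the patch:

    use core::mem;

    // Option<StackPointer> occupies a single word thanks to NonZero...
    assert_eq!(mem::size_of::<Option<StackPointer>>(), mem::size_of::<usize>());

    // ...so the raw value a trampoline leaves in a register can be
    // reinterpreted directly: 0 becomes None, anything else Some(sp).
    let exited: Option<StackPointer> = unsafe { mem::transmute(0usize) };
    assert!(exited.is_none());
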
diff --git a/src/arch/or1k.rs b/src/arch/or1k.rs
index dd1792a5..cf99405a 100644
--- a/src/arch/or1k.rs
+++ b/src/arch/or1k.rs
@@ -42,14 +42,12 @@
 // from the stack frame at r2 (in the parent stack), thus continuing
 // unwinding at the swap call site instead of falling off the end of context stack.
 use core::mem;
-use stack::Stack;
+use stack;
+use arch::StackPointer;
 
 pub const STACK_ALIGNMENT: usize = 4;
 
-#[derive(Debug, Clone, Copy)]
-pub struct StackPointer(*mut usize);
-
-pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer {
+pub unsafe fn init(stack: &stack::Stack, f: unsafe extern "C" fn(usize, StackPointer)) -> StackPointer {
   #[naked]
   unsafe extern "C" fn trampoline_1() {
     asm!(
@@ -108,15 +106,26 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
       l.lwz r5, 8(r1)
       l.jalr r5
       l.nop
+
+      # Clear the stack pointer. We can't call into this context any more once
+      # the function has returned.
+      l.or r4, r0, r0
+
+      # Restore the stack pointer of the parent context. No CFI adjustments
+      # are needed since we have the same stack frame as trampoline_1.
+      l.lwz r1, 0(r1)
+
+      # Load frame and instruction pointers of the parent context.
+      l.lwz r2, -4(r1)
+      l.lwz r9, -8(r1)
+
+      # Return into the parent context.
+      l.jr r9
+      l.nop
     "#
     : : : : "volatile")
   }
 
-  unsafe fn push(sp: &mut StackPointer, val: usize) {
-    sp.0 = sp.0.offset(-1);
-    *sp.0 = val
-  }
-
   // We set up the stack in a somewhat special way so that to the unwinder it
   // looks like trampoline_1 has called trampoline_2, which has in turn called
   // swap::trampoline.
@@ -125,20 +134,20 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
   // followed by the r2 value for that frame. This setup supports unwinding
   // using DWARF CFI as well as the frame pointer-based unwinding used by tools
   // such as perf or dtrace.
-  let mut sp = StackPointer(stack.base() as *mut usize);
+  let mut sp = StackPointer::stack_base(stack);
 
-  push(&mut sp, f as usize); // Function that trampoline_2 should call
+  sp.push(f as usize); // Function that trampoline_2 should call
 
   // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline
   // each time a context switch is performed.
-  push(&mut sp, 0xdead0cfa);                // CFA slot
-  push(&mut sp, trampoline_1 as usize + 4); // Return after the nop
+  sp.push(0xdead0cfa);                // CFA slot
+  sp.push(trampoline_1 as usize + 4); // Return after the nop
 
   // Call frame for swap::trampoline. We set up the r2 value to point to the
   // parent call frame.
   let frame = sp;
-  push(&mut sp, frame.0 as usize);          // Pointer to parent call frame
-  push(&mut sp, trampoline_2 as usize + 4); // Entry point, skip initial nop
+  sp.push(frame.offset(0) as usize);  // Pointer to parent call frame
+  sp.push(trampoline_2 as usize + 4); // Entry point, skip initial nop
 
   // The last two values are read by the swap trampoline and are actually in the
   // red zone and not below the stack pointer.
@@ -146,16 +155,10 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
 }
 
 #[inline(always)]
-pub unsafe fn swap(arg: usize, new_sp: StackPointer,
-                   new_stack: Option<&Stack>) -> (usize, StackPointer) {
+pub unsafe fn swap_link(arg: usize, new_sp: StackPointer,
+                        new_stack: &stack::Stack) -> (usize, Option<StackPointer>) {
   // Address of the topmost CFA stack slot.
-  let mut dummy: usize = mem::uninitialized();
-  let new_cfa = if let Some(new_stack) = new_stack {
-    (new_stack.base() as *mut usize).offset(-2)
-  } else {
-    // Just pass a dummy pointer if we aren't linking the stack
-    &mut dummy
-  };
+  let new_cfa = StackPointer::stack_base(new_stack).offset(-2);
 
   #[naked]
   unsafe extern "C" fn trampoline() {
@@ -179,7 +182,6 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
       # Load stack pointer of the new context.
       l.or r1, r0, r5
 
-      # Restore frame pointer and link register of the new context.
       # Load frame and instruction pointers of the new context.
       l.lwz r2, -4(r1)
       l.lwz r9, -8(r1)
@@ -192,7 +194,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
   }
 
   let ret: usize;
-  let ret_sp: *mut usize;
+  let ret_sp: usize;
   asm!(
     r#"
       # Call the trampoline to switch to the new context.
@@ -211,5 +213,57 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
       "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
       "cc", "memory"
     : "volatile");
-  (ret, StackPointer(ret_sp))
+  (ret, mem::transmute(ret_sp))
+}
+
+#[inline(always)]
+pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) {
+  #[naked]
+  unsafe extern "C" fn trampoline() {
+    asm!(
+      r#"
+        # Save the frame pointer and link register; the unwinder uses them to find
+        # the CFA of the caller, and so they have to have the correct value immediately
+        # after the call instruction that invoked the trampoline.
+        l.sw -4(r1), r2
+        l.sw -8(r1), r9
+        .cfi_offset r2, -4
+        .cfi_offset r9, -8
+
+        # Pass the stack pointer of the old context to the new one.
+        l.or r4, r0, r1
+        # Load stack pointer of the new context.
+        l.or r1, r0, r5
+
+        # Load frame and instruction pointers of the new context.
+        l.lwz r2, -4(r1)
+        l.lwz r9, -8(r1)
+
+        # Return into the new context.
+        l.jr r9
+        l.nop
+      "#
+      : : : : "volatile")
+  }
+
+  let ret: usize;
+  let ret_sp: usize;
+  asm!(
+    r#"
+      # Call the trampoline to switch to the new context.
+      l.jal ${2}
+      l.nop
+    "#
+    : "={r3}" (ret)
+      "={r4}" (ret_sp)
+    : "s" (trampoline as usize)
+      "{r3}" (arg)
+      "{r5}" (new_sp.0)
+    : /*"r0", "r1", "r2", "r3", "r4",*/ "r5",  "r6",  "r7",
+      "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
+      "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
+      "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
+      "cc", "memory"
+    : "volatile");
+  (ret, mem::transmute(ret_sp))
 }
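
For concreteness, this is the layout the or1k `init()` above builds, derived from the `sp.push` calls (4-byte slots, offsets from `stack.base()`; the annotations are mine, not from the patch):

    // base -  4  f                  function for trampoline_2 to call
    // base -  8  0xdead0cfa         CFA slot, rewritten by swap_link on every
    //                               linked switch (hence stack_base().offset(-2))
    // base - 12  trampoline_1 + 4   return address for trampoline_2's frame
    // base - 16  base - 12          r2: pointer to the parent call frame
    // base - 20  trampoline_2 + 4   r9: entry point, skipping the initial nop
    //
    // The swap trampolines load r2 and r9 from -4(r1) and -8(r1), which is why
    // the last two words sit in the red zone rather than below r1.
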
diff --git a/src/arch/x86.rs b/src/arch/x86.rs
index 226d6414..eca32459 100644
--- a/src/arch/x86.rs
+++ b/src/arch/x86.rs
@@ -42,14 +42,12 @@
 // address from the stack frame at %ebp (in the parent stack), thus continuing
 // unwinding at the swap call site instead of falling off the end of context stack.
 use core::mem;
-use stack::Stack;
+use stack;
+use arch::StackPointer;
 
 pub const STACK_ALIGNMENT: usize = 16;
 
-#[derive(Debug, Clone, Copy)]
-pub struct StackPointer(*mut usize);
-
-pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer {
+pub unsafe fn init(stack: &stack::Stack, f: unsafe extern "C" fn(usize, StackPointer)) -> StackPointer {
   #[cfg(not(target_vendor = "apple"))]
   #[naked]
   unsafe extern "C" fn trampoline_1() {
@@ -128,16 +126,31 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
       pushl %esi
       pushl %edi
       # Call the provided function.
-      call *16(%esp)
+      calll *16(%esp)
+
+      # Clear the stack pointer. We can't call into this context any more once
+      # the function has returned.
+      xorl %esi, %esi
+
+      # Restore the stack pointer of the parent context. No CFI adjustments
+      # are needed since we have the same stack frame as trampoline_1.
+      movl 8(%esp), %esp
+
+      # Restore frame pointer of the parent context.
+      popl %ebp
+      .cfi_adjust_cfa_offset -4
+      .cfi_restore %ebp
+
+      # Return into the parent context. Use `pop` and `jmp` instead of a `ret`
+      # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge).
+      popl %eax
+      .cfi_adjust_cfa_offset -4
+      .cfi_register %eip, %eax
+      jmpl *%eax
     "#
     : : : : "volatile")
   }
 
-  unsafe fn push(sp: &mut StackPointer, val: usize) {
-    sp.0 = sp.0.offset(-1);
-    *sp.0 = val
-  }
-
   // We set up the stack in a somewhat special way so that to the unwinder it
   // looks like trampoline_1 has called trampoline_2, which has in turn called
   // swap::trampoline.
@@ -146,38 +159,32 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
   // followed by the %ebp value for that frame. This setup supports unwinding
   // using DWARF CFI as well as the frame pointer-based unwinding used by tools
   // such as perf or dtrace.
-  let mut sp = StackPointer(stack.base() as *mut usize);
+  let mut sp = StackPointer::stack_base(stack);
 
-  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
-  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
-  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
-  push(&mut sp, f as usize); // Function that trampoline_2 should call
+  sp.push(0 as usize); // Padding to ensure the stack is properly aligned
+  sp.push(0 as usize); // Padding to ensure the stack is properly aligned
+  sp.push(0 as usize); // Padding to ensure the stack is properly aligned
+  sp.push(f as usize); // Function that trampoline_2 should call
 
   // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline
   // each time a context switch is performed.
-  push(&mut sp, trampoline_1 as usize + 2); // Return after the 2 nops
-  push(&mut sp, 0xdead0cfa);                // CFA slot
+  sp.push(trampoline_1 as usize + 2); // Return after the 2 nops
+  sp.push(0xdead0cfa);                // CFA slot
 
   // Call frame for swap::trampoline. We set up the %ebp value to point to the
   // parent call frame.
-  let frame = sp;
-  push(&mut sp, trampoline_2 as usize + 1); // Entry point, skip initial nop
-  push(&mut sp, frame.0 as usize);          // Pointer to parent call frame
+  let frame = sp.offset(0);
+  sp.push(trampoline_2 as usize + 1); // Entry point, skip initial nop
+  sp.push(frame as usize);            // Pointer to parent call frame
 
   sp
 }
 
 #[inline(always)]
-pub unsafe fn swap(arg: usize, new_sp: StackPointer,
-                   new_stack: Option<&Stack>) -> (usize, StackPointer) {
+pub unsafe fn swap_link(arg: usize, new_sp: StackPointer,
+                        new_stack: &stack::Stack) -> (usize, Option<StackPointer>) {
   // Address of the topmost CFA stack slot.
-  let mut dummy: usize = mem::uninitialized();
-  let new_cfa = if let Some(new_stack) = new_stack {
-    (new_stack.base() as *mut usize).offset(-6)
-  } else {
-    // Just pass a dummy pointer if we aren't linking the stack
-    &mut dummy
-  };
+  let new_cfa = StackPointer::stack_base(new_stack).offset(-6);
 
   #[naked]
   unsafe extern "C" fn trampoline() {
@@ -215,7 +222,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
   }
 
   let ret: usize;
-  let ret_sp: *mut usize;
+  let ret_sp: usize;
   asm!(
     r#"
      # Push instruction pointer of the old context and switch to
      # the new context.
@@ -233,5 +240,60 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "cc", "dirflag", "fpsr", "flags", "memory"
     : "volatile");
-  (ret, StackPointer(ret_sp))
+  (ret, mem::transmute(ret_sp))
+}
+
+#[inline(always)]
+pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) {
+  // This is identical to swap_link, but without the write to the CFA slot.
+  #[naked]
+  unsafe extern "C" fn trampoline() {
+    asm!(
+      r#"
+        # Save frame pointer explicitly; the unwinder uses it to find CFA of
+        # the caller, and so it has to have the correct value immediately after
+        # the call instruction that invoked the trampoline.
+        pushl %ebp
+        .cfi_adjust_cfa_offset 4
+        .cfi_rel_offset %ebp, 0
+
+        # Pass the stack pointer of the old context to the new one.
+        movl %esp, %esi
+        # Load stack pointer of the new context.
+        movl %edx, %esp
+
+        # Restore frame pointer of the new context.
+        popl %ebp
+        .cfi_adjust_cfa_offset -4
+        .cfi_restore %ebp
+
+        # Return into the new context. Use `pop` and `jmp` instead of a `ret`
+        # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge).
+        popl %eax
+        .cfi_adjust_cfa_offset -4
+        .cfi_register %eip, %eax
+        jmpl *%eax
+      "#
+      : : : : "volatile")
+  }
+
+  let ret: usize;
+  let ret_sp: usize;
+  asm!(
+    r#"
+      # Push instruction pointer of the old context and switch to
+      # the new context.
+      call ${2:c}
+    "#
+    : "={edi}" (ret)
+      "={esi}" (ret_sp)
+    : "s" (trampoline as usize)
+      "{edi}" (arg)
+      "{edx}" (new_sp.0)
+    : "eax", "ebx", "ecx", "edx", /*"esi", "edi", "ebp", "esp",*/
+      "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+      "cc", "dirflag", "fpsr", "flags", "memory"
+    : "volatile");
+  (ret, mem::transmute(ret_sp))
 }
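
The two entry points divide the work the old single `swap` did; which one a caller uses depends on whether frames need to be linked across stacks. A sketch of the intended division of labor inside the crate; variable names are illustrative:

    // Switching *into* a generator: swap_link stores the caller's stack
    // pointer into the CFA slot of the target stack, so DWARF unwinders and
    // frame-pointer walkers can cross from the generator into the caller.
    let (data, sp) = arch::swap_link(arg, generator_sp, &generator_stack);

    // Switching back *out of* a generator: the parent's frames are already
    // linked, no CFA slot needs updating, and the parent context cannot have
    // exited, so the cheaper swap returns a bare StackPointer.
    let (data, sp) = arch::swap(arg, parent_sp);
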
diff --git a/src/arch/x86_64.rs b/src/arch/x86_64.rs
index 11610701..65c79847 100644
--- a/src/arch/x86_64.rs
+++ b/src/arch/x86_64.rs
@@ -47,14 +47,12 @@
 // address from the stack frame at %rbp (in the parent stack), thus continuing
 // unwinding at the swap call site instead of falling off the end of context stack.
 use core::mem;
-use stack::Stack;
+use stack;
+use arch::StackPointer;
 
 pub const STACK_ALIGNMENT: usize = 16;
 
-#[derive(Debug, Clone, Copy)]
-pub struct StackPointer(*mut usize);
-
-pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer {
+pub unsafe fn init(stack: &stack::Stack, f: unsafe extern "C" fn(usize, StackPointer)) -> StackPointer {
   #[cfg(not(target_vendor = "apple"))]
   #[naked]
   unsafe extern "C" fn trampoline_1() {
@@ -130,16 +128,31 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
       nop
 
       # Call the provided function.
-      call *16(%rsp)
+      callq *16(%rsp)
+
+      # Clear the stack pointer. We can't call into this context any more once
+      # the function has returned.
+      xorq %rsi, %rsi
+
+      # Restore the stack pointer of the parent context. No CFI adjustments
+      # are needed since we have the same stack frame as trampoline_1.
+      movq (%rsp), %rsp
+
+      # Restore frame pointer of the parent context.
+      popq %rbp
+      .cfi_adjust_cfa_offset -8
+      .cfi_restore %rbp
+
+      # Return into the parent context. Use `pop` and `jmp` instead of a `ret`
+      # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge).
+      popq %rax
+      .cfi_adjust_cfa_offset -8
+      .cfi_register %rip, %rax
+      jmpq *%rax
     "#
     : : : : "volatile")
   }
 
-  unsafe fn push(sp: &mut StackPointer, val: usize) {
-    sp.0 = sp.0.offset(-1);
-    *sp.0 = val
-  }
-
   // We set up the stack in a somewhat special way so that to the unwinder it
   // looks like trampoline_1 has called trampoline_2, which has in turn called
   // swap::trampoline.
@@ -148,36 +161,30 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -
   // followed by the %rbp value for that frame. This setup supports unwinding
   // using DWARF CFI as well as the frame pointer-based unwinding used by tools
   // such as perf or dtrace.
-  let mut sp = StackPointer(stack.base() as *mut usize);
+  let mut sp = StackPointer::stack_base(stack);
 
-  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
-  push(&mut sp, f as usize); // Function that trampoline_2 should call
+  sp.push(0 as usize); // Padding to ensure the stack is properly aligned
+  sp.push(f as usize); // Function that trampoline_2 should call
 
   // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline
   // each time a context switch is performed.
-  push(&mut sp, trampoline_1 as usize + 2); // Return after the 2 nops
-  push(&mut sp, 0xdeaddeaddead0cfa);        // CFA slot
+  sp.push(trampoline_1 as usize + 2); // Return after the 2 nops
+  sp.push(0xdeaddeaddead0cfa);        // CFA slot
 
   // Call frame for swap::trampoline. We set up the %rbp value to point to the
   // parent call frame.
-  let frame = sp;
-  push(&mut sp, trampoline_2 as usize + 1); // Entry point, skip initial nop
-  push(&mut sp, frame.0 as usize);          // Pointer to parent call frame
+  let frame = sp.offset(0);
+  sp.push(trampoline_2 as usize + 1); // Entry point, skip initial nop
+  sp.push(frame as usize);            // Pointer to parent call frame
 
   sp
 }
 
 #[inline(always)]
-pub unsafe fn swap(arg: usize, new_sp: StackPointer,
-                   new_stack: Option<&Stack>) -> (usize, StackPointer) {
+pub unsafe fn swap_link(arg: usize, new_sp: StackPointer,
+                        new_stack: &stack::Stack) -> (usize, Option<StackPointer>) {
   // Address of the topmost CFA stack slot.
-  let mut dummy: usize = mem::uninitialized();
-  let new_cfa = if let Some(new_stack) = new_stack {
-    (new_stack.base() as *mut usize).offset(-4)
-  } else {
-    // Just pass a dummy pointer if we aren't linking the stack
-    &mut dummy
-  };
+  let new_cfa = StackPointer::stack_base(new_stack).offset(-4);
 
   #[naked]
   unsafe extern "C" fn trampoline() {
@@ -215,7 +222,7 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
   }
 
   let ret: usize;
-  let ret_sp: *mut usize;
+  let ret_sp: usize;
   asm!(
     r#"
      # Push instruction pointer of the old context and switch to
      # the new context.
@@ -242,5 +249,69 @@ pub unsafe fn swap(arg: usize, new_sp: StackPointer,
     // the "alignstack" LLVM inline assembly option does exactly the same
     // thing on x86_64.
     : "volatile", "alignstack");
-  (ret, StackPointer(ret_sp))
+  (ret, mem::transmute(ret_sp))
+}
+
+#[inline(always)]
+pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) {
+  // This is identical to swap_link, but without the write to the CFA slot.
+  #[naked]
+  unsafe extern "C" fn trampoline() {
+    asm!(
+      r#"
+        # Save frame pointer explicitly; the unwinder uses it to find CFA of
+        # the caller, and so it has to have the correct value immediately after
+        # the call instruction that invoked the trampoline.
+        pushq %rbp
+        .cfi_adjust_cfa_offset 8
+        .cfi_rel_offset %rbp, 0
+
+        # Pass the stack pointer of the old context to the new one.
+        movq %rsp, %rsi
+        # Load stack pointer of the new context.
+        movq %rdx, %rsp
+
+        # Restore frame pointer of the new context.
+        popq %rbp
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore %rbp
+
+        # Return into the new context. Use `pop` and `jmp` instead of a `ret`
+        # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge).
+        popq %rax
+        .cfi_adjust_cfa_offset -8
+        .cfi_register %rip, %rax
+        jmpq *%rax
+      "#
+      : : : : "volatile")
+  }
+
+  let ret: usize;
+  let ret_sp: usize;
+  asm!(
+    r#"
+      # Push instruction pointer of the old context and switch to
+      # the new context.
+      call ${2:c}
+    "#
+    : "={rdi}" (ret)
+      "={rsi}" (ret_sp)
+    : "s" (trampoline as usize)
+      "{rdi}" (arg)
+      "{rdx}" (new_sp.0)
+    : "rax", "rbx", "rcx", "rdx", /*"rsi", "rdi", "rbp", "rsp",*/
+      "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
+      "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
+      "xmm0",  "xmm1",  "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
+      "xmm8",  "xmm9",  "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
+      "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23",
+      "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31",
+      "cc", "dirflag", "fpsr", "flags", "memory"
+    // Ideally, we would set the LLVM "noredzone" attribute on this function
+    // (and it would be propagated to the call site). Unfortunately, rustc
+    // provides no such functionality. Fortunately, by a lucky coincidence,
+    // the "alignstack" LLVM inline assembly option does exactly the same
+    // thing on x86_64.
+    : "volatile", "alignstack");
+  (ret, mem::transmute(ret_sp))
 }
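
What happens when a context's function returns, end to end: trampoline_2's new epilogue (the same in spirit across all four backends) zeroes the register that carries the stack pointer back, restores the parent's stack, and jumps into the parent frame, so the parent's `swap_link` observes `ret_sp == 0`, i.e. `None`. This is exactly what the new `ret` test in src/arch/mod.rs checks:

    unsafe extern "C" fn ret2(_: usize, _: StackPointer) {} // returns immediately

    unsafe {
      let stack = OsStack::new(4 << 20).unwrap();
      let stack_ptr = arch::init(&stack, ret2);

      let (_, stack_ptr) = arch::swap_link(0, stack_ptr, &stack);
      assert!(stack_ptr.is_none()); // the context is gone for good
    }
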
+ : "volatile", "alignstack"); + (ret, mem::transmute(ret_sp)) } diff --git a/src/generator.rs b/src/generator.rs index e1ada623..37101f3f 100644 --- a/src/generator.rs +++ b/src/generator.rs @@ -81,10 +81,9 @@ pub enum State { /// ``` #[derive(Debug)] pub struct Generator<'a, Input: Send + 'a, Output: Send + 'a, Stack: stack::Stack> { - state: State, stack: Stack, stack_id: debug::StackId, - stack_ptr: arch::StackPointer, + stack_ptr: Option, phantom: PhantomData<(&'a (), *mut Input, *const Output)> } @@ -111,30 +110,27 @@ impl<'a, Input, Output, Stack> Generator<'a, Input, Output, Stack> /// See also the [contract](../trait.Stack.html) that needs to be fulfilled by `stack`. pub unsafe fn unsafe_new(stack: Stack, f: F) -> Generator<'a, Input, Output, Stack> where F: FnOnce(&Yielder, Input) + Send + 'a { - unsafe extern "C" fn generator_wrapper(env: usize, stack_ptr: StackPointer) -> ! + unsafe extern "C" fn generator_wrapper(env: usize, stack_ptr: StackPointer) where Input: Send, Output: Send, Stack: stack::Stack, F: FnOnce(&Yielder, Input) { // Retrieve our environment from the callee and return control to it. let f = ptr::read(env as *const F); - let (data, stack_ptr) = arch::swap(0, stack_ptr, None); + let (data, stack_ptr) = arch::swap(0, stack_ptr); // See the second half of Yielder::suspend_bare. let input = ptr::read(data as *const Input); // Run the body of the generator. let yielder = Yielder::new(stack_ptr); f(&yielder, input); - // Past this point, the generator has dropped everything it has held. - loop { yielder.suspend_bare(None); } } let stack_id = debug::StackId::register(&stack); let stack_ptr = arch::init(&stack, generator_wrapper::); // Transfer environment to the callee. - let stack_ptr = arch::swap(&f as *const F as usize, stack_ptr, Some(&stack)).1; + let stack_ptr = arch::swap_link(&f as *const F as usize, stack_ptr, &stack).1; mem::forget(f); Generator { - state: State::Runnable, stack: stack, stack_id: stack_id, stack_ptr: stack_ptr, @@ -146,40 +142,39 @@ impl<'a, Input, Output, Stack> Generator<'a, Input, Output, Stack> /// If the generator function has returned, returns `None`. #[inline] pub fn resume(&mut self, input: Input) -> Option { - match self.state { - State::Runnable => { - // Set the state to Unavailable. Since we have exclusive access to the generator, - // the only case where this matters is the generator function panics, after which - // it must not be invocable again. - self.state = State::Unavailable; - - // Switch to the generator function, and retrieve the yielded value. - let val = unsafe { - let (data_out, stack_ptr) = arch::swap(&input as *const Input as usize, self.stack_ptr, Some(&self.stack)); - self.stack_ptr = stack_ptr; - mem::forget(input); - ptr::read(data_out as *const Option) - }; - - // Unless the generator function has returned, it can be switched to again, so - // set the state to Runnable. - if val.is_some() { self.state = State::Runnable } - - val + // Return None if we have no stack pointer (generator function already returned). + self.stack_ptr.and_then(|stack_ptr| { + // Set the state to Unavailable. Since we have exclusive access to the generator, + // the only case where this matters is the generator function panics, after which + // it must not be invocable again. + self.stack_ptr = None; + + // Switch to the generator function, and retrieve the yielded value. 
diff --git a/src/lib.rs b/src/lib.rs
index 0d357226..3db5cfd5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,7 +4,7 @@
 // http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
 // http://opensource.org/licenses/MIT>, at your option. This file may not be
 // copied, modified, or distributed except according to those terms.
-#![feature(asm, naked_functions, cfg_target_vendor)]
+#![feature(asm, naked_functions, cfg_target_vendor, nonzero)]
 #![cfg_attr(feature = "alloc", feature(alloc, heap_api))]
 #![cfg_attr(test, feature(test))]
 #![no_std]
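
Taken together, these changes let a generator function simply fall off its end, and the `Iterator` impl for `Generator<'a, (), Output, Stack>` (context line in the generator.rs hunk above) terminates when that happens. A minimal end-to-end sketch from the point of view of a user of the crate (nightly-only; sizes illustrative):

    extern crate fringe;
    use fringe::{Generator, OsStack};

    fn main() {
      let stack = OsStack::new(1 << 16).unwrap();
      let gen = Generator::new(stack, |yielder, ()| {
        for i in 0..3 {
          yielder.suspend(i)
        }
        // Returning here is what makes the iterator stop.
      });
      assert_eq!(gen.collect::<Vec<i32>>(), vec![0, 1, 2]);
    }
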