diff --git a/Cargo.toml b/Cargo.toml
index ff2a6e18..9709a7b6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,7 @@ libc = "0.2.14"
 optional = true
 version = "1.0.0"
 
-[dev-dependencies]
+[target.'cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))'.dev-dependencies]
 simd = "0.1"
 
 [features]
diff --git a/README.md b/README.md
index 62d473b5..f0376c1a 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,7 @@ there should be at least 8 KiB of free stack space, or panicking will result in
 
 ## Limitations
 
-The architectures currently supported are: x86, x86_64, aarch64, or1k.
+The architectures currently supported are: x86, x86_64, aarch64, arm, or1k.
 
 The platforms currently supported are: bare metal, Linux (any libc), FreeBSD,
 DragonFly BSD, macOS.
@@ -176,13 +176,15 @@ of callee-saved registers.
 
 ### Call stack splicing
 
-Non-Windows platforms use [DWARF][] for both stack unwinding and debugging. DWARF call frame
-information is very generic to be ABI-agnostic—it defines a bytecode that describes the actions
-that need to be performed to simulate returning from a function. libfringe uses this bytecode
-to specify that, after the generator function has returned, execution continues at the point
-where the generator function was resumed the last time.
+Non-Windows platforms use [DWARF][] (or the highly similar [ARM EHABI][ehabi]) for both stack
+unwinding and debugging. DWARF call frame information is very generic in order to be
+ABI-agnostic—it defines a bytecode that describes the actions that need to be performed
+to simulate returning from a function. libfringe uses this bytecode to specify that, after
+the generator function has returned, execution continues at the point where the generator
+function was resumed the last time.
 
 [dwarf]: http://dwarfstd.org
+[ehabi]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0038b/IHI0038B_ehabi.pdf
 
 ## Windows compatibility
 
diff --git a/src/arch/arm.rs b/src/arch/arm.rs
new file mode 100644
index 00000000..18b0c94c
--- /dev/null
+++ b/src/arch/arm.rs
@@ -0,0 +1,304 @@
+// This file is part of libfringe, a low-level green threading library.
+// Copyright (c) Nathan Zadoks <nathan@nathan7.eu>,
+//               whitequark <whitequark@whitequark.org>,
+//               Amanieu d'Antras <amanieu@gmail.com>
+// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT
+// or http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+// To understand the machine code in this file, keep in mind these facts:
+// * The ARM AAPCS ABI passes the first argument in r0. We also use r0 to pass a value
+//   while swapping context; this is an arbitrary choice
+//   (we clobber all registers and could use any of them) but this allows us
+//   to reuse the swap function to perform the initial call.
+//
+// To understand the ARM EHABI CFI code in this file, keep in mind these facts:
+// * CFI is "call frame information"; a set of instructions to a debugger or
+//   an unwinder that allow it to simulate returning from functions. This implies
+//   restoring every register to its pre-call state, as well as the stack pointer.
+// * CFA is "call frame address"; the value of the stack pointer right before the call
+//   instruction in the caller. Everything strictly below the CFA (and inclusive until
+//   the next CFA) is the call frame of the callee. This implies that the return
+//   address is part of the callee's call frame.
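+//   For example, immediately after a `push {fp, lr}` in a function's prologue
+//   (stack slots are 4 bytes wide on ARM and the stack grows down), the frame
+//   looks like this:
+//     CFA-4: saved lr, i.e. the return address
+//     CFA-8: saved fp  <- new value of sp
+//   This is exactly the layout described by the CFI directives in the
+//   trampolines below.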
+// * Logically, ARM EHABI CFI is a table where rows are instruction pointer values and
+//   columns describe where registers are spilled (mostly using expressions that
+//   compute a memory location as CFA+n). A .save pseudoinstruction changes
+//   the state of a column for all IPs numerically larger than the one it's placed
+//   after. The .pad and .setfp pseudoinstructions change the CFA value similarly.
+// * Simulating return is as easy as restoring register values from the CFI table
+//   and then setting the stack pointer to the CFA.
+//
+// A high-level overview of the function of the trampolines is:
+// * The 2nd init trampoline puts a controlled value (written by swap to the CFA slot)
+//   into r11. This is then used as the CFA for the 1st trampoline.
+// * This controlled value points to the bottom of the stack of the parent context,
+//   which holds the saved r11 and lr from the call to swap().
+// * The 1st init trampoline tells the unwinder to restore r11 and lr
+//   from the stack frame at r11 (in the parent stack), thus continuing
+//   unwinding at the swap call site instead of falling off the end of the context stack.
+use core::mem;
+use arch::StackPointer;
+use unwind;
+
+pub const STACK_ALIGNMENT: usize = 8;
+
+pub unsafe fn init(stack_base: *mut u8, f: unsafe fn(usize, StackPointer)) -> StackPointer {
+  #[cfg(not(target_vendor = "apple"))]
+  #[naked]
+  unsafe extern "C" fn trampoline_1() {
+    asm!(
+      r#"
+        # gdb has a hardcoded check that rejects backtraces where frame addresses
+        # do not monotonically decrease. It is turned off if the function is called
+        # "__morestack" and that is hardcoded. So, to make gdb backtraces match
+        # the actual unwinder behavior, we call ourselves "__morestack" and mark
+        # the symbol as local; it shouldn't interfere with anything.
+      __morestack:
+      .local __morestack
+
+        # Set up the first part of our ARM EHABI CFI linking stacks together. When
+        # we reach this function from unwinding, r11 will be pointing at the bottom
+        # of the parent linked stack. This link is set each time swap() is called.
+        # When unwinding the frame corresponding to this function, an ARM EHABI unwinder
+        # will use r11+8 as the next call frame address, restore the return address (lr)
+        # from CFA-4 and restore r11 from CFA-8. This mirrors what the second half
+        # of `swap_link` does.
+        # .setfp fp, sp
+        # .save {fp, lr}
+        .cfi_def_cfa fp, 8
+        .cfi_offset lr, -4
+        .cfi_offset fp, -8
+
+        # This nop is here so that the initial swap doesn't return to the start
+        # of the trampoline, which confuses the unwinder since it will look for
+        # frame information in the previous symbol rather than this one. It is
+        # never actually executed.
+        nop
+
+      .Lend:
+      .size __morestack, .Lend-__morestack
+      "#
+      : : : : "volatile")
+  }
+
+  #[cfg(target_vendor = "apple")]
+  #[naked]
+  unsafe extern "C" fn trampoline_1() {
+    asm!(
+      r#"
+        # Identical to the above, except that it avoids the .local/.size
+        # directives, which aren't available on Mach-O.
+      __morestack:
+      .private_extern __morestack
+        # .setfp fp, sp
+        # .save {fp, lr}
+        .cfi_def_cfa fp, 8
+        .cfi_offset lr, -4
+        .cfi_offset fp, -8
+        nop
+      "#
+      : : : : "volatile")
+  }
+
+  #[naked]
+  unsafe extern "C" fn trampoline_2() {
+    asm!(
+      r#"
+        # Set up the second part of our ARM EHABI CFI.
+        # When unwinding the frame corresponding to this function, a DWARF unwinder
+        # will restore r11 (and thus the CFA of the first trampoline) from the stack slot.
+        # This stack slot is updated every time swap() is called to point to the bottom
+        # of the stack of the context that was just switched from.
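+        # The .setfp/.save pseudoinstructions below are the ARM EHABI spelling
+        # of this frame information; LLVM does not currently accept EHABI
+        # directives in inline assembly (see unwind.rs), so the equivalent
+        # DWARF CFI directives are emitted instead and the EHABI forms are
+        # left here as comments for reference.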
+        # .setfp fp, sp
+        # .save {fp, lr}
+        .cfi_def_cfa fp, 8
+        .cfi_offset lr, -4
+        .cfi_offset fp, -8
+
+        # This nop is here so that the return address of the swap trampoline
+        # doesn't point to the start of the symbol. This confuses gdb's backtraces,
+        # causing them to think the parent function is trampoline_1 instead of
+        # trampoline_2.
+        nop
+
+        # Call unwind_wrapper with the provided function and the stack base address.
+        add r2, sp, #16
+        ldr r3, [sp, #8]
+        bl ${0}
+
+        # Restore the stack pointer of the parent context. No CFI adjustments
+        # are needed since we have the same stack frame as trampoline_1.
+        ldr sp, [sp]
+
+        # Load the frame and instruction pointers of the parent context.
+        pop {fp, lr}
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore fp
+        .cfi_restore lr
+
+        # If the returned value is nonzero, trigger an unwind in the parent
+        # context with the given exception object.
+        cmp r0, #0
+        bne ${1}
+
+        # Clear the stack pointer. We can't call into this context any more once
+        # the function has returned.
+        mov r1, #0
+
+        # Return into the new context. Use `r12` instead of `lr` to avoid
+        # return address mispredictions.
+        mov r12, lr
+        bx r12
+      "#
+      :
+      : "s" (unwind::unwind_wrapper as usize)
+        "s" (unwind::start_unwind as usize)
+      : : "volatile")
+  }
+
+  // We set up the stack in a somewhat special way so that to the unwinder it
+  // looks like trampoline_1 has called trampoline_2, which has in turn called
+  // swap.
+  //
+  // There are 2 call frames in this setup, each containing the return address
+  // followed by the r11 value for that frame. This setup supports unwinding
+  // using DWARF CFI as well as the frame pointer-based unwinding used by tools
+  // such as perf or dtrace.
+  let mut sp = StackPointer::new(stack_base);
+
+  sp.push(0 as usize); // Padding to ensure the stack is properly aligned
+  sp.push(f as usize); // Function that trampoline_2 should call
+
+  // Call frame for trampoline_2. The CFA slot is updated by swap_link
+  // each time a context switch is performed.
+  sp.push(trampoline_1 as usize + 4); // Return after the nop
+  sp.push(0xdead0cfa); // CFA slot
+
+  // Call frame for swap. We set up the r11 value to point to the
+  // parent call frame.
+  let frame = sp.offset(0);
+  sp.push(trampoline_2 as usize + 4); // Entry point, skip initial nop
+  sp.push(frame as usize); // Pointer to parent call frame
+
+  sp
+}
+
+#[inline(always)]
+pub unsafe fn swap_link(arg: usize, new_sp: StackPointer,
+                        new_stack_base: *mut u8) -> (usize, Option<StackPointer>) {
+  let ret: usize;
+  let ret_sp: usize;
+  asm!(
+    r#"
+      # Set up the link register to point past the context switch.
+      adr lr, 0f
+
+      # Save the frame pointer and link register; the unwinder uses them to find
+      # the CFA of the caller, so they have to have the correct values immediately
+      # after the context switch.
+      push {fp, lr}
+
+      # Pass the stack pointer of the old context to the new one.
+      mov r1, sp
+
+      # Link the call stacks together by writing the current stack bottom
+      # address to the CFA slot in the new stack.
+      str sp, [r3, #-16]
+
+      # Load the stack pointer of the new context.
+      mov sp, r2
+
+      # Load the frame and instruction pointers of the new context.
+      pop {fp, r12}
+
+      # Return into the new context. Use `r12` instead of `lr` to avoid
+      # return address mispredictions.
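+      # (A plain `bx lr` would look like a function return to the branch
+      # predictor and unbalance its internal return-address stack, since this
+      # branch doesn't pair with a call.)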
+      bx r12
+
+    0:
+    "#
+    : "={r0}" (ret)
+      "={r1}" (ret_sp)
+    : "{r0}" (arg)
+      "{r2}" (new_sp.0)
+      "{r3}" (new_stack_base)
+    :/*r0,   r1,*/ "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
+      "r8",  "r9",  "r10",/*r11,*/"r12",/*sp,*/ "lr", /*pc,*/
+      "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+      "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
+      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+      "cc", "memory"
+    : "volatile");
+  (ret, mem::transmute(ret_sp))
+}
+
+#[inline(always)]
+pub unsafe fn swap(arg: usize, new_sp: StackPointer) -> (usize, StackPointer) {
+  // This is identical to swap_link, but without the write to the CFA slot.
+  let ret: usize;
+  let ret_sp: usize;
+  asm!(
+    r#"
+      adr lr, 0f
+      push {fp, lr}
+      mov r1, sp
+      mov sp, r2
+      pop {fp, r12}
+      bx r12
+    0:
+    "#
+    : "={r0}" (ret)
+      "={r1}" (ret_sp)
+    : "{r0}" (arg)
+      "{r2}" (new_sp.0)
+    :/*r0,   r1,*/ "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
+      "r8",  "r9",  "r10",/*r11,*/"r12",/*sp,*/ "lr", /*pc,*/
+      "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+      "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
+      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+      "cc", "memory"
+    // We need the "alignstack" attribute here to ensure that the stack is
+    // properly aligned if a call to start_unwind needs to be injected into
+    // our stack context.
+    : "volatile", "alignstack");
+  (ret, mem::transmute(ret_sp))
+}
+
+#[inline(always)]
+pub unsafe fn unwind(new_sp: StackPointer, new_stack_base: *mut u8) {
+  // Argument to pass to start_unwind, based on the stack base address.
+  let arg = unwind::unwind_arg(new_stack_base);
+
+  // This is identical to swap_link, except that it performs a tail call to
+  // start_unwind instead of returning into the target context.
+  asm!(
+    r#"
+      adr lr, 0f
+      push {fp, lr}
+      str sp, [r3, #-16]
+      mov sp, r2
+      pop {fp, r12}
+
+      # Jump to the start_unwind function, which will force a stack unwind in
+      # the target context. This will eventually return to us through the
+      # stack link.
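+      # (This is a branch, not a call: start_unwind must run within the target
+      # context, using the frame we just restored, so that the unwind it raises
+      # propagates through the target stack.)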
+      b ${0}
+
+    0:
+    "#
+    :
+    : "s" (unwind::start_unwind as usize)
+      "{r0}" (arg)
+      "{r2}" (new_sp.0)
+      "{r3}" (new_stack_base)
+    : "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
+      "r8",  "r9",  "r10",/*r11,*/"r12",/*sp,*/ "lr", /*pc,*/
+      "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
+      "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
+      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+      "cc", "memory"
+    : "volatile");
+}
diff --git a/src/arch/mod.rs b/src/arch/mod.rs
index 0805d2ae..92513250 100644
--- a/src/arch/mod.rs
+++ b/src/arch/mod.rs
@@ -13,6 +13,7 @@ use core::nonzero::NonZero;
 #[cfg_attr(target_arch = "x86", path = "x86.rs")]
 #[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")]
 #[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
+#[cfg_attr(target_arch = "arm", path = "arm.rs")]
 #[cfg_attr(target_arch = "or1k", path = "or1k.rs")]
 mod imp;
 
@@ -40,6 +41,7 @@ impl StackPointer {
 #[cfg(test)]
 mod tests {
   extern crate test;
+  #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
   extern crate simd;
 
   use arch::{self, StackPointer};
@@ -66,6 +68,7 @@ mod tests {
     }
   }
 
+  #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
   #[test]
   fn context_simd() {
     unsafe fn permuter(arg: usize, stack_ptr: StackPointer) {
diff --git a/src/arch/x86.rs b/src/arch/x86.rs
index bea2b149..86edc402 100644
--- a/src/arch/x86.rs
+++ b/src/arch/x86.rs
@@ -343,7 +343,7 @@ pub unsafe fn unwind(new_sp: StackPointer, new_stack_base: *mut u8) {
 
   asm!(
     r#"
-      call ${0:c}@plt
+      call ${0:c}
     "#
     :
     : "s" (trampoline as usize)
diff --git a/src/unwind.rs b/src/unwind.rs
index fc0b3f51..ebe4a30c 100644
--- a/src/unwind.rs
+++ b/src/unwind.rs
@@ -28,7 +28,9 @@ fn have_cross_stack_unwind() -> bool {
   //   for now.
   // - iOS on ARM uses setjmp/longjmp instead of DWARF-2 unwinding, which needs
   //   to be explicitly saved/restored when switching contexts.
-  !(cfg!(windows) || cfg!(all(target_os = "ios", target_arch = "arm")))
+  // - LLVM doesn't currently support ARM EHABI directives in inline assembly,
+  //   so we instead need to propagate exceptions manually across contexts.
+  !(cfg!(windows) || cfg!(target_arch = "arm"))
 }
 
 // Wrapper around the root function of a generator which handles unwinding.