diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index 6c8782e9cc55..fbd7dad611ca 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -179,3 +179,106 @@ fn isa_constructor(
     let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
     Box::new(backend)
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::cursor::{Cursor, FuncCursor};
+    use crate::ir::types::*;
+    use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
+    use crate::isa::CallConv;
+    use crate::settings;
+    use crate::settings::Configurable;
+    use core::str::FromStr;
+    use target_lexicon::Triple;
+
+    /// We have to test cold blocks by observing final machine code,
+    /// rather than VCode, because the VCode orders blocks in lowering
+    /// order, not emission order. (The exact difference between the
+    /// two is that cold blocks are sunk in the latter.) We might as
+    /// well do the test here, where we have a backend to use.
+    #[test]
+    fn test_cold_blocks() {
+        let name = ExternalName::testcase("test0");
+        let mut sig = Signature::new(CallConv::SystemV);
+        sig.params.push(AbiParam::new(I32));
+        sig.returns.push(AbiParam::new(I32));
+        let mut func = Function::with_name_signature(name, sig);
+
+        let bb0 = func.dfg.make_block();
+        let arg0 = func.dfg.append_block_param(bb0, I32);
+        let bb1 = func.dfg.make_block();
+        let bb2 = func.dfg.make_block();
+        let bb3 = func.dfg.make_block();
+        let bb1_param = func.dfg.append_block_param(bb1, I32);
+        let bb3_param = func.dfg.append_block_param(bb3, I32);
+
+        let mut pos = FuncCursor::new(&mut func);
+
+        pos.insert_block(bb0);
+        let v0 = pos.ins().iconst(I32, 0x1234);
+        let v1 = pos.ins().iadd(arg0, v0);
+        pos.ins().brnz(v1, bb1, &[v1]);
+        pos.ins().jump(bb2, &[]);
+
+        pos.insert_block(bb1);
+        let v2 = pos.ins().isub(v1, v0);
+        let v3 = pos.ins().iadd(v2, bb1_param);
+        pos.ins().brnz(v1, bb2, &[]);
+        pos.ins().jump(bb3, &[v3]);
+
+        pos.func.layout.set_cold(bb2);
+        pos.insert_block(bb2);
+        let v4 = pos.ins().iadd(v1, v0);
+        pos.ins().brnz(v4, bb2, &[]);
+        pos.ins().jump(bb1, &[v4]);
+
+        pos.insert_block(bb3);
+        pos.ins().return_(&[bb3_param]);
+
+        let mut shared_flags_builder = settings::builder();
+        shared_flags_builder.set("opt_level", "none").unwrap();
+        shared_flags_builder.set("enable_verifier", "true").unwrap();
+        let shared_flags = settings::Flags::new(shared_flags_builder);
+        let isa_flags = x64_settings::Flags::new(&shared_flags, x64_settings::builder());
+        let backend = X64Backend::new_with_flags(
+            Triple::from_str("x86_64").unwrap(),
+            shared_flags,
+            isa_flags,
+        );
+        let result = backend
+            .compile_function(&mut func, /* want_disasm = */ false)
+            .unwrap();
+        let code = result.buffer.data();
+
+        // 00000000  55                push rbp
+        // 00000001  4889E5            mov rbp,rsp
+        // 00000004  4889FE            mov rsi,rdi
+        // 00000007  81C634120000      add esi,0x1234
+        // 0000000D  85F6              test esi,esi
+        // 0000000F  0F841B000000      jz near 0x30
+        // 00000015  4889F7            mov rdi,rsi
+        // 00000018  4889F0            mov rax,rsi
+        // 0000001B  81E834120000      sub eax,0x1234
+        // 00000021  01F8              add eax,edi
+        // 00000023  85F6              test esi,esi
+        // 00000025  0F8505000000      jnz near 0x30
+        // 0000002B  4889EC            mov rsp,rbp
+        // 0000002E  5D                pop rbp
+        // 0000002F  C3                ret
+        // 00000030  4889F7            mov rdi,rsi    <--- cold block
+        // 00000033  81C734120000      add edi,0x1234
+        // 00000039  85FF              test edi,edi
+        // 0000003B  0F85EFFFFFFF      jnz near 0x30
+        // 00000041  E9D2FFFFFF        jmp 0x18
+
+        let golden = vec![
+            85, 72, 137, 229, 72, 137, 254, 129, 198, 52, 18, 0, 0, 133, 246, 15, 132, 27, 0, 0, 0,
+            72, 137, 247, 72, 137, 240, 129, 232, 52, 18, 0, 0, 1, 248, 133, 246, 15, 133, 5, 0, 0,
+            0, 72, 137, 236, 93, 195, 72, 137, 247, 129, 199, 52, 18, 0, 0, 133, 255, 15, 133, 239,
+            255, 255, 255, 233, 210, 255, 255, 255,
+        ];
+
+        assert_eq!(code, &golden[..]);
+    }
+}
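The golden bytes in the test above are the raw contents of the emitted buffer, matching the disassembly comment byte for byte. If a later codegen change legitimately perturbs the output, the vector has to be refreshed by hand; a minimal sketch of one way to do that (the `dump_golden` helper is hypothetical and not part of this patch):

// Hypothetical helper, not part of this patch: print emitted bytes in
// the decimal form used by `golden` above so they can be pasted back
// into the test. The commented listing can then be refreshed by
// disassembling the same bytes (e.g. with `ndisasm -b 64`).
fn dump_golden(code: &[u8]) {
    let bytes: Vec<String> = code.iter().map(|b| b.to_string()).collect();
    println!("let golden = vec![{}];", bytes.join(", "));
}

fn main() {
    // Example with the first two instructions of the listing above
    // (push rbp; mov rbp,rsp):
    dump_golden(&[0x55, 0x48, 0x89, 0xE5]);
}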
diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs
index 3cdf1228ca1f..5c805c4029a8 100644
--- a/cranelift/codegen/src/machinst/blockorder.rs
+++ b/cranelift/codegen/src/machinst/blockorder.rs
@@ -99,6 +99,13 @@ pub struct BlockLoweringOrder {
     /// blocks.
     #[allow(dead_code)]
     orig_map: SecondaryMap<Block, Option<BlockIndex>>,
+    /// Cold blocks. These blocks are not reordered in the
+    /// `lowered_order` above; the lowered order must respect RPO
+    /// (uses after defs) in order for lowering to be
+    /// correct. Instead, this set is used to provide `is_cold()`,
+    /// which is used by VCode emission to sink the blocks at the last
+    /// moment (when we actually emit bytes into the MachBuffer).
+    cold_blocks: FxHashSet<BlockIndex>,
 }
 
 /// The origin of a block in the lowered block-order: either an original CLIF
@@ -378,31 +385,28 @@ impl BlockLoweringOrder {
         postorder.reverse();
         let mut rpo = postorder;
-
-        // Step 3: sink any cold blocks to the end of the
-        // function. Put the "deferred last" block truly at the end;
-        // this is a correctness requirement (for fallthrough
-        // returns).
-        rpo.sort_by_key(|block| {
-            block
-                .0
-                .orig_block()
-                .map(|block| f.layout.is_cold(block))
-                .unwrap_or(false)
-        });
-
         if let Some(d) = deferred_last {
             rpo.push(d);
         }
 
-        // Step 4: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
+        // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
         let mut lowered_order = vec![];
+        let mut cold_blocks = FxHashSet::default();
         let mut lowered_succ_ranges = vec![];
         let mut lb_to_bindex = FxHashMap::default();
         for (block, succ_range) in rpo.into_iter() {
-            lb_to_bindex.insert(block, lowered_order.len() as BlockIndex);
+            let index = lowered_order.len() as BlockIndex;
+            lb_to_bindex.insert(block, index);
             lowered_order.push(block);
             lowered_succ_ranges.push(succ_range);
+
+            if block
+                .orig_block()
+                .map(|b| f.layout.is_cold(b))
+                .unwrap_or(false)
+            {
+                cold_blocks.insert(index);
+            }
         }
 
         let lowered_succ_indices = lowered_succs
@@ -424,6 +428,7 @@ impl BlockLoweringOrder {
             lowered_succ_indices,
             lowered_succ_ranges,
             orig_map,
+            cold_blocks,
         };
         log::trace!("BlockLoweringOrder: {:?}", result);
         result
     }
@@ -439,6 +444,11 @@ impl BlockLoweringOrder {
         let range = self.lowered_succ_ranges[block as usize];
        &self.lowered_succ_indices[range.0..range.1]
     }
+
+    /// Determine whether the given lowered-block index is cold.
+    pub fn is_cold(&self, block: BlockIndex) -> bool {
+        self.cold_blocks.contains(&block)
+    }
 }
 
 #[cfg(test)]
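The invariant preserved here is that the lowered order stays a reverse postorder (so defs are seen in the right order during lowering), while coldness is recorded in a side set that only emission consults. A self-contained sketch of the same separation on a toy CFG, with plain std collections standing in for Cranelift's `LoweredBlock`/`FxHashSet` (all names illustrative):

use std::collections::HashSet;

/// Reverse postorder over a toy CFG given as successor lists.
fn rpo(succs: &[Vec<usize>], entry: usize) -> Vec<usize> {
    fn visit(b: usize, succs: &[Vec<usize>], seen: &mut [bool], post: &mut Vec<usize>) {
        seen[b] = true;
        for &s in &succs[b] {
            if !seen[s] {
                visit(s, succs, seen, post);
            }
        }
        post.push(b);
    }
    let mut seen = vec![false; succs.len()];
    let mut post = Vec::new();
    visit(entry, succs, &mut seen, &mut post);
    post.reverse();
    post
}

fn main() {
    // bb0 -> {bb1, bb2}, bb1 -> {bb2, bb3}, bb2 -> {bb2, bb1}, bb3 -> {}:
    // the CFG built by the x64 test above, with bb2 marked cold.
    let succs = vec![vec![1, 2], vec![2, 3], vec![2, 1], vec![]];
    // The lowering order is plain RPO; bb2 stays wherever RPO puts it.
    assert_eq!(rpo(&succs, 0), vec![0, 1, 3, 2]);
    // Coldness lives in a side set, queried only at emission time
    // (the analogue of `BlockLoweringOrder::is_cold`).
    let cold: HashSet<usize> = [2].into_iter().collect();
    let is_cold = |b: usize| cold.contains(&b);
    assert!(is_cold(2) && !is_cold(1));
}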
diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs
index 0e8300f49d58..3043e193472b 100644
--- a/cranelift/codegen/src/machinst/vcode.rs
+++ b/cranelift/codegen/src/machinst/vcode.rs
@@ -470,11 +470,24 @@ impl<I: VCodeInst> VCode<I> {
         let mut inst_ends = vec![0; self.insts.len()];
         let mut label_insn_iix = vec![0; self.num_blocks()];
 
+        // Construct the final order we emit code in: cold blocks at the end.
+        let mut final_order: SmallVec<[BlockIndex; 16]> = smallvec![];
+        let mut cold_blocks: SmallVec<[BlockIndex; 16]> = smallvec![];
+        for block in 0..self.num_blocks() {
+            let block = block as BlockIndex;
+            if self.block_order.is_cold(block) {
+                cold_blocks.push(block);
+            } else {
+                final_order.push(block);
+            }
+        }
+        final_order.extend(cold_blocks);
+
+        // Emit blocks.
         let mut safepoint_idx = 0;
         let mut cur_srcloc = None;
         let mut last_offset = None;
-        for block in 0..self.num_blocks() {
-            let block = block as BlockIndex;
+        for block in final_order {
             let new_offset = I::align_basic_block(buffer.cur_offset());
             while new_offset > buffer.cur_offset() {
                 // Pad with NOPs up to the aligned block offset.
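The order construction above is a stable partition: hot blocks keep their relative RPO positions and cold blocks are appended at the end, also in their original relative order. A standalone sketch of that step, with plain `Vec`/`HashSet` standing in for `SmallVec`/`FxHashSet` (illustrative only, not the patch's code):

use std::collections::HashSet;

/// Stable partition: hot block indices first, then cold ones,
/// each group in its original relative order.
fn emission_order(num_blocks: u32, is_cold: impl Fn(u32) -> bool) -> Vec<u32> {
    let mut final_order = Vec::with_capacity(num_blocks as usize);
    let mut cold_blocks = Vec::new();
    for block in 0..num_blocks {
        if is_cold(block) {
            cold_blocks.push(block);
        } else {
            final_order.push(block);
        }
    }
    final_order.extend(cold_blocks);
    final_order
}

fn main() {
    // Five blocks with blocks 1 and 3 cold: hot blocks keep their
    // order, cold ones are sunk to the end in theirs.
    let cold: HashSet<u32> = [1, 3].into_iter().collect();
    assert_eq!(
        emission_order(5, |b| cold.contains(&b)),
        vec![0, 2, 4, 1, 3],
    );
}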