gfx-rs · cwfitzgerald · Feb 29, 2024 · Feb 26, 2024 · Feb 29, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -120,6 +120,7 @@ Bottom level categories:
 - Fix timeout when presenting a surface where no work has been done. By @waywardmonkeys in [#5200](https://github.com/gfx-rs/wgpu/pull/5200)
 - Simplify and speed up the allocation of internal IDs. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229)
 - Fix an issue where command encoders weren't properly freed if an error occurred during command encoding. By @ErichDonGubler in [#5251](https://github.com/gfx-rs/wgpu/pull/5251).
+- Fix behavior of `extractBits` and `insertBits` when `offset + count` overflows the bit width. By @cwfitzgerald in [#5305](https://github.com/gfx-rs/wgpu/pull/5305)
 - Fix registry leaks with de-duplicated resources. By @nical in [#5244](https://github.com/gfx-rs/wgpu/pull/5244)
 - Fix behavior of integer `clamp` when `min` argument > `max` argument. By @cwfitzgerald in [#5300](https://github.com/gfx-rs/wgpu/pull/5300).
 - Fix missing validation for `Device::clear_buffer` where `offset + size buffer.size` was not checked when `size` was omitted. By @ErichDonGubler in [#5282](https://github.com/gfx-rs/wgpu/pull/5282).

diff --git a/naga/src/back/glsl/mod.rs b/naga/src/back/glsl/mod.rs
@@ -1290,7 +1290,14 @@ impl<'a, W: Write> Writer<'a, W> {
 
             let inner = expr_info.ty.inner_with(&self.module.types);
 
-            if let Expression::Math { fun, arg, arg1, .. } = *expr {
+            if let Expression::Math {
+                fun,
+                arg,
+                arg1,
+                arg2,
+                ..
+            } = *expr
+            {
                 match fun {
                     crate::MathFunction::Dot => {
                         // if the expression is a Dot product with integer arguments,
@@ -1305,6 +1312,14 @@ impl<'a, W: Write> Writer<'a, W> {
                             }
                         }
                     }
+                    crate::MathFunction::ExtractBits => {
+                        // Only argument 1 is re-used.
+                        self.need_bake_expressions.insert(arg1.unwrap());
+                    }
+                    crate::MathFunction::InsertBits => {
+                        // Only argument 2 is re-used.
+                        self.need_bake_expressions.insert(arg2.unwrap());
+                    }
                     crate::MathFunction::CountLeadingZeros => {
                         if let Some(crate::ScalarKind::Sint) = inner.scalar_kind() {
                             self.need_bake_expressions.insert(arg);
@@ -3375,8 +3390,59 @@ impl<'a, W: Write> Writer<'a, W> {
                     }
                     Mf::CountOneBits => "bitCount",
                     Mf::ReverseBits => "bitfieldReverse",
-                    Mf::ExtractBits => "bitfieldExtract",
-                    Mf::InsertBits => "bitfieldInsert",
+                    Mf::ExtractBits => {
+                        // The behavior of ExtractBits is undefined when offset + count > bit_width. We need
+                        // to first sanitize the offset and count first. If we don't do this, AMD and Intel chips
+                        // will return out-of-spec values if the extracted range is not within the bit width.
+                        //
+                        // This encodes the exact formula specified by the wgsl spec, without temporary values:
+                        // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
+                        //
+                        // w = sizeof(x) * 8
+                        // o = min(offset, w)
+                        // c = min(count, w - o)
+                        //
+                        // bitfieldExtract(x, o, c)
+                        //
+                        // extract_bits(e, min(offset, w), min(count, w - min(offset, w))))
+                        let scalar_bits = ctx
+                            .resolve_type(arg, &self.module.types)
+                            .scalar_width()
+                            .unwrap();
+
+                        write!(self.out, "bitfieldExtract(")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, ", int(min(")?;
+                        self.write_expr(arg1.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u)), int(min(",)?;
+                        self.write_expr(arg2.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u - min(")?;
+                        self.write_expr(arg1.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u))))")?;
+
+                        return Ok(());
+                    }
+                    Mf::InsertBits => {
+                        // InsertBits has the same considerations as ExtractBits above
+                        let scalar_bits = ctx
+                            .resolve_type(arg, &self.module.types)
+                            .scalar_width()
+                            .unwrap();
+
+                        write!(self.out, "bitfieldInsert(")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, ", ")?;
+                        self.write_expr(arg1.unwrap(), ctx)?;
+                        write!(self.out, ", int(min(")?;
+                        self.write_expr(arg2.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u)), int(min(",)?;
+                        self.write_expr(arg3.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u - min(")?;
+                        self.write_expr(arg2.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u))))")?;
+
+                        return Ok(());
+                    }
                     Mf::FindLsb => "findLSB",
                     Mf::FindMsb => "findMSB",
                     // data packing

diff --git a/naga/src/back/hlsl/help.rs b/naga/src/back/hlsl/help.rs
@@ -26,7 +26,11 @@ int dim_1d = NagaDimensions1D(image_1d);
 ```
 */
 
-use super::{super::FunctionCtx, BackendResult};
+use super::{
+    super::FunctionCtx,
+    writer::{EXTRACT_BITS_FUNCTION, INSERT_BITS_FUNCTION},
+    BackendResult,
+};
 use crate::{arena::Handle, proc::NameKey};
 use std::fmt::Write;
 
@@ -59,6 +63,13 @@ pub(super) struct WrappedMatCx2 {
     pub(super) columns: crate::VectorSize,
 }
 
+#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
+pub(super) struct WrappedMath {
+    pub(super) fun: crate::MathFunction,
+    pub(super) scalar: crate::Scalar,
+    pub(super) components: Option<u32>,
+}
+
 /// HLSL backend requires its own `ImageQuery` enum.
 ///
 /// It is used inside `WrappedImageQuery` and should be unique per ImageQuery function.
@@ -851,12 +862,149 @@ impl<'a, W: Write> super::Writer<'a, W> {
         Ok(())
     }
 
+    pub(super) fn write_wrapped_math_functions(
+        &mut self,
+        module: &crate::Module,
+        func_ctx: &FunctionCtx,
+    ) -> BackendResult {
+        for (_, expression) in func_ctx.expressions.iter() {
+            if let crate::Expression::Math {
+                fun,
+                arg,
+                arg1: _arg1,
+                arg2: _arg2,
+                arg3: _arg3,
+            } = *expression
+            {
+                match fun {
+                    crate::MathFunction::ExtractBits => {
+                        // The behavior of our extractBits polyfill is undefined if offset + count > bit_width. We need
+                        // to first sanitize the offset and count first. If we don't do this, we will get out-of-spec
+                        // values if the extracted range is not within the bit width.
+                        //
+                        // This encodes the exact formula specified by the wgsl spec:
+                        // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
+                        //
+                        // w = sizeof(x) * 8
+                        // o = min(offset, w)
+                        // c = min(count, w - o)
+                        //
+                        // bitfieldExtract(x, o, c)
+                        let arg_ty = func_ctx.resolve_type(arg, &module.types);
+                        let scalar = arg_ty.scalar().unwrap();
+                        let components = arg_ty.components();
+
+                        let wrapped = WrappedMath {
+                            fun,
+                            scalar,
+                            components,
+                        };
+
+                        if !self.wrapped.math.insert(wrapped) {
+                            continue;
+                        }
+
+                        // Write return type
+                        self.write_value_type(module, arg_ty)?;
+
+                        let scalar_width: u8 = scalar.width * 8;
+
+                        // Write function name and parameters
+                        writeln!(self.out, " {EXTRACT_BITS_FUNCTION}(")?;
+                        write!(self.out, "    ")?;
+                        self.write_value_type(module, arg_ty)?;
+                        writeln!(self.out, " e,")?;
+                        writeln!(self.out, "    uint offset,")?;
+                        writeln!(self.out, "    uint count")?;
+                        writeln!(self.out, ") {{")?;
+
+                        // Write function body
+                        writeln!(self.out, "    uint w = {scalar_width};")?;
+                        writeln!(self.out, "    uint o = min(offset, w);")?;
+                        writeln!(self.out, "    uint c = min(count, w - o);")?;
+                        writeln!(
+                            self.out,
+                            "    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));"
+                        )?;
+
+                        // End of function body
+                        writeln!(self.out, "}}")?;
+                    }
+                    crate::MathFunction::InsertBits => {
+                        // The behavior of our insertBits polyfill has the same constraints as the extractBits polyfill.
+
+                        let arg_ty = func_ctx.resolve_type(arg, &module.types);
+                        let scalar = arg_ty.scalar().unwrap();
+                        let components = arg_ty.components();
+
+                        let wrapped = WrappedMath {
+                            fun,
+                            scalar,
+                            components,
+                        };
+
+                        if !self.wrapped.math.insert(wrapped) {
+                            continue;
+                        }
+
+                        // Write return type
+                        self.write_value_type(module, arg_ty)?;
+
+                        let scalar_width: u8 = scalar.width * 8;
+                        let scalar_max: u64 = match scalar.width {
+                            1 => 0xFF,
+                            2 => 0xFFFF,
+                            4 => 0xFFFFFFFF,
+                            8 => 0xFFFFFFFFFFFFFFFF,
+                            _ => unreachable!(),
+                        };
+
+                        // Write function name and parameters
+                        writeln!(self.out, " {INSERT_BITS_FUNCTION}(")?;
+                        write!(self.out, "    ")?;
+                        self.write_value_type(module, arg_ty)?;
+                        writeln!(self.out, " e,")?;
+                        write!(self.out, "    ")?;
+                        self.write_value_type(module, arg_ty)?;
+                        writeln!(self.out, " newbits,")?;
+                        writeln!(self.out, "    uint offset,")?;
+                        writeln!(self.out, "    uint count")?;
+                        writeln!(self.out, ") {{")?;
+
+                        // Write function body
+                        writeln!(self.out, "    uint w = {scalar_width}u;")?;
+                        writeln!(self.out, "    uint o = min(offset, w);")?;
+                        writeln!(self.out, "    uint c = min(count, w - o);")?;
+
+                        // The `u` suffix on the literals is _extremely_ important. Otherwise it will use
+                        // i32 shifting instead of the intended u32 shifting.
+                        writeln!(
+                            self.out,
+                            "    uint mask = (({scalar_max}u >> ({scalar_width}u - c)) << o);"
+                        )?;
+                        writeln!(
+                            self.out,
+                            "    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));"
+                        )?;
+
+                        // End of function body
+                        writeln!(self.out, "}}")?;
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     /// Helper function that writes various wrapped functions
     pub(super) fn write_wrapped_functions(
         &mut self,
         module: &crate::Module,
         func_ctx: &FunctionCtx,
     ) -> BackendResult {
+        self.write_wrapped_math_functions(module, func_ctx)?;
         self.write_wrapped_compose_functions(module, func_ctx.expressions)?;
 
         for (handle, _) in func_ctx.expressions.iter() {

diff --git a/naga/src/back/hlsl/keywords.rs b/naga/src/back/hlsl/keywords.rs
@@ -817,6 +817,8 @@ pub const RESERVED: &[&str] = &[
     // Naga utilities
     super::writer::MODF_FUNCTION,
     super::writer::FREXP_FUNCTION,
+    super::writer::EXTRACT_BITS_FUNCTION,
+    super::writer::INSERT_BITS_FUNCTION,
 ];
 
 // DXC scalar types, from https://github.com/microsoft/DirectXShaderCompiler/blob/18c9e114f9c314f93e68fbc72ce207d4ed2e65ae/tools/clang/lib/AST/ASTContextHLSL.cpp#L48-L254

diff --git a/naga/src/back/hlsl/mod.rs b/naga/src/back/hlsl/mod.rs
@@ -256,6 +256,7 @@ struct Wrapped {
     constructors: crate::FastHashSet<help::WrappedConstructor>,
     struct_matrix_access: crate::FastHashSet<help::WrappedStructMatrixAccess>,
     mat_cx2s: crate::FastHashSet<help::WrappedMatCx2>,
+    math: crate::FastHashSet<help::WrappedMath>,
 }
 
 impl Wrapped {
@@ -265,6 +266,7 @@ impl Wrapped {
         self.constructors.clear();
         self.struct_matrix_access.clear();
         self.mat_cx2s.clear();
+        self.math.clear();
     }
 }