Fixes oven-sh#7503 (oven-sh#7511)

Co-authored-by: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com>
Hanaasagi · Dec 7, 2023 · 653e293 · 653e293
1 parent 798f548
commit 653e293
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 24 deletions.
diff --git a/src/bun.js/webcore/blob.zig b/src/bun.js/webcore/blob.zig
@@ -3815,15 +3815,17 @@ pub const Blob = struct {
         return this.store != null and this.store.?.data == .file;
     }
 
-    pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, buf: []const u8, comptime lifetime: Lifetime) JSValue {
+    pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue {
         // null == unknown
         // false == can't be
         const could_be_all_ascii = this.is_all_ascii orelse this.store.?.is_all_ascii;
 
+        const buf = strings.withoutUTF8BOM(buf_);
+
         if (could_be_all_ascii == null or !could_be_all_ascii.?) {
             // if toUTF16Alloc returns null, it means there are no non-ASCII characters
             // instead of erroring, invalid characters will become a U+FFFD replacement character
-            if (strings.toUTF16AllocAllowBOM(bun.default_allocator, buf, false, true) catch unreachable) |external| {
+            if (strings.toUTF16Alloc(bun.default_allocator, buf, false) catch unreachable) |external| {
                 if (lifetime != .temporary)
                     this.setIsASCIIFlag(false);
 
@@ -3850,21 +3852,36 @@ pub const Blob = struct {
             // we don't need to clone
             .clone => {
                 this.store.?.ref();
+                // we don't need to worry about UTF-8 BOM in this case because the store owns the memory.
                 return ZigString.init(buf).external(global, this.store.?, Store.external);
             },
             .transfer => {
                 var store = this.store.?;
                 std.debug.assert(store.data == .bytes);
                 this.transfer();
+                // we don't need to worry about UTF-8 BOM in this case because the store owns the memory.
                 return ZigString.init(buf).external(global, store, Store.external);
             },
             // strings are immutable
             // sharing isn't really a thing
             .share => {
                 this.store.?.ref();
+                // we don't need to worry about UTF-8 BOM in this case because the store owns the memory.
                 return ZigString.init(buf).external(global, this.store.?, Store.external);
             },
             .temporary => {
+                // if there was a UTF-8 BOM, we need to clone the buffer because
+                // external doesn't support this case here yet.
+                if (buf.len != buf_.len) {
+                    var out = bun.String.createLatin1(buf);
+                    defer {
+                        bun.default_allocator.free(buf_);
+                        out.deref();
+                    }
+
+                    return out.toJS(global);
+                }
+
                 return ZigString.init(buf).toExternalValue(global);
             },
         }
@@ -3894,7 +3911,8 @@ pub const Blob = struct {
         return toJSONWithBytes(this, global, view_, lifetime);
     }
 
-    pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, buf: []const u8, comptime lifetime: Lifetime) JSValue {
+    pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue {
+        const buf = strings.withoutUTF8BOM(buf_);
         if (buf.len == 0) return global.createSyntaxErrorInstance("Unexpected end of JSON input", .{});
         // null == unknown
         // false == can't be
@@ -3905,7 +3923,7 @@ pub const Blob = struct {
             var stack_fallback = std.heap.stackFallback(4096, bun.default_allocator);
             const allocator = stack_fallback.get();
             // if toUTF16Alloc returns null, it means there are no non-ASCII characters
-            if (strings.toUTF16AllocAllowBOM(allocator, buf, false, true) catch null) |external| {
+            if (strings.toUTF16Alloc(allocator, buf, false) catch null) |external| {
                 if (comptime lifetime != .temporary) this.setIsASCIIFlag(false);
                 const result = ZigString.init16(external).toJSONObject(global);
                 allocator.free(external);
@@ -4534,11 +4552,19 @@ pub const InternalBlob = struct {
     was_string: bool = false,
 
     pub fn toStringOwned(this: *@This(), globalThis: *JSC.JSGlobalObject) JSValue {
-        if (strings.toUTF16AllocAllowBOM(globalThis.allocator(), this.bytes.items, false, true) catch &[_]u16{}) |out| {
+        const bytes_without_bom = strings.withoutUTF8BOM(this.bytes.items);
+        if (strings.toUTF16Alloc(globalThis.allocator(), bytes_without_bom, false) catch &[_]u16{}) |out| {
             const return_value = ZigString.toExternalU16(out.ptr, out.len, globalThis);
             return_value.ensureStillAlive();
             this.deinit();
             return return_value;
+        } else if
+        // If there was a UTF8 BOM, we clone it
+        (bytes_without_bom.len != this.bytes.items.len) {
+            defer this.deinit();
+            var out = bun.String.createLatin1(this.bytes.items[3..]);
+            defer out.deref();
+            return out.toJS(globalThis);
         } else {
             var str = ZigString.init(this.toOwnedSlice());
             str.mark();

diff --git a/src/string_immutable.zig b/src/string_immutable.zig
@@ -1313,13 +1313,9 @@ pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void {
 
 const utf8_bom = [_]u8{ 0xef, 0xbb, 0xbf };
 
-pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
-    return toUTF16AllocAllowBOM(allocator, bytes, fail_if_invalid, false);
-}
-
 pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
-    if (bytes.len > 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) {
-        return bytes[3..];
+    if (strings.hasPrefixComptime(bytes, utf8_bom)) {
+        return bytes[utf8_bom.len..];
     } else {
         return bytes;
     }
@@ -1328,20 +1324,8 @@ pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
 /// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters
 /// If there are no non-ascii characters, this returns null
 /// This is intended to be used for strings that go to JavaScript
-pub fn toUTF16AllocAllowBOM(allocator: std.mem.Allocator, bytes_: []const u8, comptime fail_if_invalid: bool, comptime allow_bom: bool) !?[]u16 {
-    var bytes = bytes_;
+pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
     if (strings.firstNonASCII(bytes)) |i| {
-        if (comptime allow_bom) {
-            // we could avoid the allocation here when it's otherwise ASCII. But
-            // it gets really complicated because most memory allocators need
-            // the head pointer to be the allocated one so if we instead return
-            // a non-head pointer and try to free that the allocator might not
-            // be able to free it, and we would have a big problem.
-            if (i == 0 and bytes.len > 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) {
-                bytes = bytes[3..];
-            }
-        }
-
         const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
             const trimmed = bun.simdutf.trim.utf8(bytes);
 

diff --git a/test/js/web/fetch/utf8-bom.test.ts b/test/js/web/fetch/utf8-bom.test.ts
@@ -1,7 +1,39 @@
 import { describe, expect, it, test } from "bun:test";
 
 describe("UTF-8 BOM should be ignored", () => {
+  test("handles empty strings", async () => {
+    const blob = new Response(new Blob([Buffer.from([0xef, 0xbb, 0xbf])]));
+
+    expect(await blob.text()).toHaveLength(0);
+    expect(async () => await blob.json()).toThrow();
+  });
+
+  test("handles UTF8 BOM + emoji", async () => {
+    const blob = new Response(new Blob([Buffer.from([0xef, 0xbb, 0xbf]), Buffer.from("🌎")]));
+
+    expect(await blob.text()).toHaveLength(2);
+    expect(async () => await blob.json()).toThrow();
+  });
+
   describe("Blob", () => {
+    describe("with emoji", () => {
+      it("in text()", async () => {
+        const blob = new Blob(["\uFEFFHello, World! 🌎"], { type: "text/plain" });
+        expect(await blob.text()).toBe("Hello, World! 🌎");
+      });
+
+      it("in json()", async () => {
+        const blob = new Blob(['\uFEFF{"hello":"World 🌎"}'], { type: "application/json" });
+        expect(await blob.json()).toStrictEqual({ "hello": "World 🌎" } as any);
+      });
+
+      it("in formData()", async () => {
+        const blob = new Blob(["\uFEFFhello=world 🌎"], { type: "application/x-www-form-urlencoded" });
+        const formData = await blob.formData();
+        expect(formData.get("hello")).toBe("world 🌎");
+      });
+    });
+
     it("in text()", async () => {
       const blob = new Blob(["\uFEFFHello, World!"], { type: "text/plain" });
       expect(await blob.text()).toBe("Hello, World!");